Skip to content

Commit 4ec7cf8

Browse files
authored
Additional operator edge case fixes (#2007)
Fix a few edge-case situations: - Restart evicted pods that have reached the terminal `Failed` state with reason `Evicted` by simply recreating them. These pods are not retried automatically, so they need to be recreated (eviction usually happens due to memory pressure on the node). - Don't treat containers in `ContainerCreating` as running: even though this state is usually brief, it's possible for containers to get stuck there, and excluding it improves the accuracy of exec seconds tracking. - Consolidate the state transition for running states: the state is set either to running or to pending-wait/generate-wacz/uploading-wacz, and changing to any of these states is allowed from each other or from waiting_capacity.
1 parent 3923996 commit 4ec7cf8

File tree

2 files changed

+48
-42
lines changed

2 files changed

+48
-42
lines changed

backend/btrixcloud/operator/crawls.py

Lines changed: 36 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -305,19 +305,21 @@ async def sync_crawls(self, data: MCSyncData):
305305
"resyncAfterSeconds": status.resync_after,
306306
}
307307

308-
def _load_redis(self, params, status, children):
308+
def _load_redis(self, params, status: CrawlStatus, children):
309309
name = f"redis-{params['id']}"
310310
has_pod = name in children[POD]
311311

312312
pod_info = status.podStatus[name]
313313
params["name"] = name
314314
params["cpu"] = pod_info.newCpu or params.get("redis_cpu")
315315
params["memory"] = pod_info.newMemory or params.get("redis_memory")
316-
restart = pod_info.should_restart_pod() and has_pod
317-
if restart:
318-
print(f"Restart {name}")
316+
restart_reason = None
317+
if has_pod:
318+
restart_reason = pod_info.should_restart_pod()
319+
if restart_reason:
320+
print(f"Restarting {name}, reason: {restart_reason}")
319321

320-
params["init_redis"] = status.initRedis and not restart
322+
params["init_redis"] = status.initRedis and not restart_reason
321323

322324
return self.load_from_yaml("redis.yaml", params)
323325

@@ -362,7 +364,7 @@ async def _load_qa_configmap(self, params, children):
362364
params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
363365
return self.load_from_yaml("qa_configmap.yaml", params)
364366

365-
def _load_crawler(self, params, i, status, children):
367+
def _load_crawler(self, params, i, status: CrawlStatus, children):
366368
name = f"crawl-{params['id']}-{i}"
367369
has_pod = name in children[POD]
368370

@@ -387,11 +389,12 @@ def _load_crawler(self, params, i, status, children):
387389
else:
388390
params["memory_limit"] = self.k8s.max_crawler_memory_size
389391
params["workers"] = params.get(worker_field) or 1
390-
params["do_restart"] = (
391-
pod_info.should_restart_pod() or params.get("force_restart")
392-
) and has_pod
393-
if params.get("do_restart"):
394-
print(f"Restart {name}")
392+
params["do_restart"] = False
393+
if has_pod:
394+
restart_reason = pod_info.should_restart_pod(params.get("force_restart"))
395+
if restart_reason:
396+
print(f"Restarting {name}, reason: {restart_reason}")
397+
params["do_restart"] = True
395398

396399
return self.load_from_yaml("crawler.yaml", params)
397400

@@ -523,7 +526,7 @@ async def set_state(
523526
finished=finished,
524527
stats=stats,
525528
)
526-
if res:
529+
if res and status.state != state:
527530
print(f"Setting state: {status.state} -> {state}, {crawl.id}")
528531
status.state = state
529532
return True
@@ -804,14 +807,6 @@ async def sync_crawl_state(
804807
status.resync_after = self.fast_retry_secs
805808
return status
806809

807-
# ensure running state is set
808-
await self.set_state(
809-
"running",
810-
status,
811-
crawl,
812-
allowed_from=["starting", "waiting_capacity"],
813-
)
814-
815810
# update lastActiveTime if crawler is running
816811
if crawler_running:
817812
status.lastActiveTime = to_k8s_date(dt_now())
@@ -874,25 +869,32 @@ def sync_pod_status(
874869
try:
875870
for name, pod in pods.items():
876871
running = False
872+
evicted = False
877873

878874
pstatus = pod["status"]
879875
phase = pstatus["phase"]
880876
role = pod["metadata"]["labels"]["role"]
881877

882878
if phase in ("Running", "Succeeded"):
883879
running = True
880+
elif phase == "Failed" and pstatus.get("reason") == "Evicted":
881+
evicted = True
882+
883+
status.podStatus[name].evicted = evicted
884884

885885
if "containerStatuses" in pstatus:
886886
cstatus = pstatus["containerStatuses"][0]
887887

888-
# consider 'ContainerCreating' as running
889-
waiting = cstatus["state"].get("waiting")
890-
if (
891-
phase == "Pending"
892-
and waiting
893-
and waiting.get("reason") == "ContainerCreating"
894-
):
895-
running = True
888+
# don't consider 'ContainerCreating' as running for now
889+
# may be stuck in this state for other reasons
890+
#
891+
# waiting = cstatus["state"].get("waiting")
892+
# if (
893+
# phase == "Pending"
894+
# and waiting
895+
# and waiting.get("reason") == "ContainerCreating"
896+
# ):
897+
# running = True
896898

897899
self.handle_terminated_pod(
898900
name, role, status, cstatus["state"].get("terminated")
@@ -1388,24 +1390,20 @@ async def update_crawl_state(
13881390
else:
13891391
await self.fail_crawl(crawl, status, pods, stats)
13901392

1391-
# check for other statuses
1393+
# check for other statuses, default to "running"
13921394
else:
1393-
new_status: Optional[TYPE_RUNNING_STATES] = None
1394-
if status_count.get("running"):
1395-
if status.state in ("generate-wacz", "uploading-wacz", "pending-wacz"):
1396-
new_status = "running"
1395+
new_status: TYPE_RUNNING_STATES = "running"
13971396

1398-
elif status_count.get("generate-wacz"):
1397+
if status_count.get("generate-wacz"):
13991398
new_status = "generate-wacz"
14001399
elif status_count.get("uploading-wacz"):
14011400
new_status = "uploading-wacz"
14021401
elif status_count.get("pending-wait"):
14031402
new_status = "pending-wait"
14041403

1405-
if new_status:
1406-
await self.set_state(
1407-
new_status, status, crawl, allowed_from=RUNNING_STATES
1408-
)
1404+
await self.set_state(
1405+
new_status, status, crawl, allowed_from=RUNNING_AND_WAITING_STATES
1406+
)
14091407

14101408
return status
14111409

backend/btrixcloud/operator/models.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ class PodInfo(BaseModel):
134134
newMemory: Optional[int] = None
135135
signalAtMem: Optional[int] = None
136136

137+
evicted: Optional[bool] = False
138+
137139
def dict(self, *a, **kw):
138140
res = super().dict(*a, **kw)
139141
percent = {
@@ -168,15 +170,21 @@ def get_percent_storage(self) -> float:
168170
else 0
169171
)
170172

171-
def should_restart_pod(self):
173+
def should_restart_pod(self, forced: bool = False) -> Optional[str]:
172174
"""return true if pod should be restarted"""
173175
if self.newMemory and self.newMemory != self.allocated.memory:
174-
return True
176+
return "newMemory"
175177

176178
if self.newCpu and self.newCpu != self.allocated.cpu:
177-
return True
179+
return "newCpu"
180+
181+
if self.evicted:
182+
return "evicted"
183+
184+
if forced:
185+
return "forced"
178186

179-
return False
187+
return None
180188

181189

182190
# ============================================================================

0 commit comments

Comments
 (0)