@@ -305,19 +305,21 @@ async def sync_crawls(self, data: MCSyncData):
305
305
"resyncAfterSeconds" : status .resync_after ,
306
306
}
307
307
308
- def _load_redis (self , params , status , children ):
308
+ def _load_redis (self , params , status : CrawlStatus , children ):
309
309
name = f"redis-{ params ['id' ]} "
310
310
has_pod = name in children [POD ]
311
311
312
312
pod_info = status .podStatus [name ]
313
313
params ["name" ] = name
314
314
params ["cpu" ] = pod_info .newCpu or params .get ("redis_cpu" )
315
315
params ["memory" ] = pod_info .newMemory or params .get ("redis_memory" )
316
- restart = pod_info .should_restart_pod () and has_pod
317
- if restart :
318
- print (f"Restart { name } " )
316
+ restart_reason = None
317
+ if has_pod :
318
+ restart_reason = pod_info .should_restart_pod ()
319
+ if restart_reason :
320
+ print (f"Restarting { name } , reason: { restart_reason } " )
319
321
320
- params ["init_redis" ] = status .initRedis and not restart
322
+ params ["init_redis" ] = status .initRedis and not restart_reason
321
323
322
324
return self .load_from_yaml ("redis.yaml" , params )
323
325
@@ -362,7 +364,7 @@ async def _load_qa_configmap(self, params, children):
362
364
params ["qa_source_replay_json" ] = crawl_replay .json (include = {"resources" })
363
365
return self .load_from_yaml ("qa_configmap.yaml" , params )
364
366
365
- def _load_crawler (self , params , i , status , children ):
367
+ def _load_crawler (self , params , i , status : CrawlStatus , children ):
366
368
name = f"crawl-{ params ['id' ]} -{ i } "
367
369
has_pod = name in children [POD ]
368
370
@@ -387,11 +389,12 @@ def _load_crawler(self, params, i, status, children):
387
389
else :
388
390
params ["memory_limit" ] = self .k8s .max_crawler_memory_size
389
391
params ["workers" ] = params .get (worker_field ) or 1
390
- params ["do_restart" ] = (
391
- pod_info .should_restart_pod () or params .get ("force_restart" )
392
- ) and has_pod
393
- if params .get ("do_restart" ):
394
- print (f"Restart { name } " )
392
+ params ["do_restart" ] = False
393
+ if has_pod :
394
+ restart_reason = pod_info .should_restart_pod (params .get ("force_restart" ))
395
+ if restart_reason :
396
+ print (f"Restarting { name } , reason: { restart_reason } " )
397
+ params ["do_restart" ] = True
395
398
396
399
return self .load_from_yaml ("crawler.yaml" , params )
397
400
@@ -523,7 +526,7 @@ async def set_state(
523
526
finished = finished ,
524
527
stats = stats ,
525
528
)
526
- if res :
529
+ if res and status . state != state :
527
530
print (f"Setting state: { status .state } -> { state } , { crawl .id } " )
528
531
status .state = state
529
532
return True
@@ -804,14 +807,6 @@ async def sync_crawl_state(
804
807
status .resync_after = self .fast_retry_secs
805
808
return status
806
809
807
- # ensure running state is set
808
- await self .set_state (
809
- "running" ,
810
- status ,
811
- crawl ,
812
- allowed_from = ["starting" , "waiting_capacity" ],
813
- )
814
-
815
810
# update lastActiveTime if crawler is running
816
811
if crawler_running :
817
812
status .lastActiveTime = to_k8s_date (dt_now ())
@@ -874,25 +869,32 @@ def sync_pod_status(
874
869
try :
875
870
for name , pod in pods .items ():
876
871
running = False
872
+ evicted = False
877
873
878
874
pstatus = pod ["status" ]
879
875
phase = pstatus ["phase" ]
880
876
role = pod ["metadata" ]["labels" ]["role" ]
881
877
882
878
if phase in ("Running" , "Succeeded" ):
883
879
running = True
880
+ elif phase == "Failed" and pstatus .get ("reason" ) == "Evicted" :
881
+ evicted = True
882
+
883
+ status .podStatus [name ].evicted = evicted
884
884
885
885
if "containerStatuses" in pstatus :
886
886
cstatus = pstatus ["containerStatuses" ][0 ]
887
887
888
- # consider 'ContainerCreating' as running
889
- waiting = cstatus ["state" ].get ("waiting" )
890
- if (
891
- phase == "Pending"
892
- and waiting
893
- and waiting .get ("reason" ) == "ContainerCreating"
894
- ):
895
- running = True
888
+ # don't consider 'ContainerCreating' as running for now
889
+ # may be stuck in this state for other reasons
890
+ #
891
+ # waiting = cstatus["state"].get("waiting")
892
+ # if (
893
+ # phase == "Pending"
894
+ # and waiting
895
+ # and waiting.get("reason") == "ContainerCreating"
896
+ # ):
897
+ # running = True
896
898
897
899
self .handle_terminated_pod (
898
900
name , role , status , cstatus ["state" ].get ("terminated" )
@@ -1388,24 +1390,20 @@ async def update_crawl_state(
1388
1390
else :
1389
1391
await self .fail_crawl (crawl , status , pods , stats )
1390
1392
1391
- # check for other statuses
1393
+ # check for other statuses, default to "running"
1392
1394
else :
1393
- new_status : Optional [TYPE_RUNNING_STATES ] = None
1394
- if status_count .get ("running" ):
1395
- if status .state in ("generate-wacz" , "uploading-wacz" , "pending-wacz" ):
1396
- new_status = "running"
1395
+ new_status : TYPE_RUNNING_STATES = "running"
1397
1396
1398
- elif status_count .get ("generate-wacz" ):
1397
+ if status_count .get ("generate-wacz" ):
1399
1398
new_status = "generate-wacz"
1400
1399
elif status_count .get ("uploading-wacz" ):
1401
1400
new_status = "uploading-wacz"
1402
1401
elif status_count .get ("pending-wait" ):
1403
1402
new_status = "pending-wait"
1404
1403
1405
- if new_status :
1406
- await self .set_state (
1407
- new_status , status , crawl , allowed_from = RUNNING_STATES
1408
- )
1404
+ await self .set_state (
1405
+ new_status , status , crawl , allowed_from = RUNNING_AND_WAITING_STATES
1406
+ )
1409
1407
1410
1408
return status
1411
1409
0 commit comments