@@ -472,45 +472,84 @@ async def application_exists(self, app_name: str) -> bool:
472472 logger .debug (f"Application { app_name } exists: { exists } " )
473473 return exists
474474
async def wait_for_application_deletion(
    self,
    app_name: str,
    max_retries: int = 5,
    retry_delay: int = 3,
    kubectl_connector: Any = None,
    namespace: str = "rig-prd-operations",
) -> bool:
    """
    Wait for an ArgoCD application to be fully deleted.

    ArgoCD is queried first and remains the primary source of truth. Its API,
    however, has returned 'permission denied' to an admin caller for
    applications that still exist, so that response is never read as "deleted".
    Whenever ArgoCD's answer is anything other than "still exists", the result
    is cross-checked against the Kubernetes API when a connector is available.

    A failure of the Kubernetes check itself is treated as "unconfirmed" and the
    loop simply retries; it never aborts the wait.

    Args:
        app_name: Name of the application to wait for
        max_retries: Maximum number of checks to perform
        retry_delay: Delay between checks in seconds
        kubectl_connector: Connector exposing an awaitable
            ``argocd_application_exists(app_name, namespace)`` that yields
            False for a clean NotFound, True when the object is present and
            None when the state is unknown. Without it, a 'permission denied'
            answer from ArgoCD cannot be confirmed and is reported as not
            deleted, while a plain "not found" from ArgoCD is accepted as is.
        namespace: Namespace holding the ArgoCD Application CR

    Returns:
        True if ArgoCD reports the application gone and the Kubernetes API
        (when a connector is supplied) confirms it, or if ArgoCD answers
        'permission denied' and the Kubernetes API confirms absence.
        False if deletion could not be confirmed within ``max_retries`` checks.
    """
    import asyncio

    logger.info(f"Waiting for application deletion: {app_name} (max {max_retries} retries)")

    async def _confirmed_gone_via_k8s() -> bool:
        # Ground-truth fallback. Only a clean NotFound (False) counts as gone;
        # still-present (True) or unknown (None) must not be read as deleted.
        if kubectl_connector is None:
            return False
        try:
            k8s_exists = await kubectl_connector.argocd_application_exists(app_name, namespace)
        except Exception as e:
            # This helper is also awaited from inside the PermissionError handler,
            # where a raised exception would bypass the sibling `except Exception`
            # and escape the retry loop. Absorb it here: an errored lookup is
            # simply "could not confirm".
            logger.error(f"Error confirming deletion of {app_name} via Kubernetes API: {e}")
            return False
        return k8s_exists is False

    for attempt in range(max_retries):
        try:
            exists = await self.application_exists(app_name)

        except PermissionError:
            # 'permission denied' is NOT proof the app is gone: ArgoCD has been
            # seen returning it for apps that still exist. Trust only the
            # Kubernetes API for this case.
            if await _confirmed_gone_via_k8s():
                logger.info(
                    f"Application {app_name} confirmed deleted via Kubernetes API "
                    f"(ArgoCD returned permission denied; not trusting it as 'deleted')"
                )
                return True
            logger.warning(
                f"Application {app_name}: ArgoCD returned permission denied but the Kubernetes API "
                f"shows it still present (or could not confirm) - treating as NOT deleted"
            )

        except Exception as e:
            logger.error(f"Error checking application deletion status: {e}")

        else:
            if exists:
                logger.debug(f"Application {app_name} still exists, retry {attempt + 1}/{max_retries}")
            elif kubectl_connector is None or await _confirmed_gone_via_k8s():
                # ArgoCD reports the app gone. With a connector we additionally
                # require the Kubernetes API to agree before declaring success.
                logger.info(f"Application {app_name} confirmed deleted after {attempt + 1} checks")
                return True
            else:
                # ArgoCD says gone, Kubernetes does not agree (present or unknown).
                logger.warning(
                    f"Application {app_name}: ArgoCD reports it gone but the Kubernetes API "
                    f"could not confirm, retry {attempt + 1}/{max_retries}"
                )

        if attempt < max_retries - 1:  # Don't sleep on the last attempt
            await asyncio.sleep(retry_delay)

    logger.warning(f"Application {app_name} NOT confirmed deleted after {max_retries} retries")
    return False
515554
516555
0 commit comments