RijksICTGilde
diff --git a/‎bootstrap/rig-system/kustomize/operations-manager/overlays/odcn-production/kustomization.yaml‎
Lines changed: 1 addition & 0 deletions b/‎bootstrap/rig-system/kustomize/operations-manager/overlays/odcn-production/kustomization.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎bootstrap/rig-system/kustomize/operations-manager/overlays/odcn-production/prometheusrule-billing.yaml‎
Lines changed: 34 additions & 0 deletions b/‎bootstrap/rig-system/kustomize/operations-manager/overlays/odcn-production/prometheusrule-billing.yaml‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎docker-compose.dev.yaml‎
Lines changed: 1 addition & 1 deletion b/‎docker-compose.dev.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎infrastructure/bootstrap/infrastructure/postgresql/database/overlays/odcn/kustomization.yaml‎
Lines changed: 18 additions & 0 deletions b/‎infrastructure/bootstrap/infrastructure/postgresql/database/overlays/odcn/kustomization.yaml‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎operations-manager/python/opi/connectors/argo.py‎
Lines changed: 53 additions & 14 deletions b/‎operations-manager/python/opi/connectors/argo.py‎
Lines changed: 53 additions & 14 deletions
diff --git a/‎operations-manager/python/opi/connectors/kubectl.py‎
Lines changed: 30 additions & 0 deletions b/‎operations-manager/python/opi/connectors/kubectl.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎operations-manager/python/opi/connectors/subdomain.py‎
Lines changed: 37 additions & 0 deletions b/‎operations-manager/python/opi/connectors/subdomain.py‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎operations-manager/python/opi/core/task_handlers_deployment.py‎
Lines changed: 16 additions & 10 deletions b/‎operations-manager/python/opi/core/task_handlers_deployment.py‎
Lines changed: 16 additions & 10 deletions
diff --git a/‎operations-manager/python/opi/core/task_handlers_project.py‎
Lines changed: 10 additions & 1 deletion b/‎operations-manager/python/opi/core/task_handlers_project.py‎
Lines changed: 10 additions & 1 deletion
@@ -11,6 +11,7 @@ resources:
   - ingress.yaml
   - configmap.yaml
   - network-policy.yaml
+  - prometheusrule-billing.yaml
 
   # Additional hostname zad.rijksapp.nl with Let's Encrypt (transition period)
   - issuer-letsencrypt.yaml
 
@@ -0,0 +1,34 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: operations-manager-billing
+  labels:
+    app: operations-manager
+spec:
+  groups:
+    # Precomputed billing metric for the usage-and-costs page.
+    # ODCN bills memory as requests + clamp_min(usage - requests, 0).
+    # Recording this hourly means the costs page no longer has to run a heavy
+    # nested per-pod aggregation over 30 days (requested by ODCN).
+    # The 1h interval is deliberate: usage changes slowly and this is an estimate.
+    - name: rig-billing.rules
+      interval: 1h
+      rules:
+        - record: rig:namespace_memory_billed_bytes
+          expr: |
+            sum by (namespace) (
+              sum by (namespace, pod) (
+                kube_pod_resource_request{job="scheduler", namespace=~"rig-prd-.*", resource="memory"}
+              )
+              + clamp_min(
+                  sum by (namespace, pod) (
+                    container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", namespace=~"rig-prd-.*", container!="", image!="", prometheus!="openshift-monitoring/k8s"}
+                  )
+                  - sum by (namespace, pod) (
+                      kube_pod_resource_request{job="scheduler", namespace=~"rig-prd-.*", resource="memory"}
+                    ),
+                  0
+                )
+            )
+        - record: rig:tenant_memory_billed_bytes
+          expr: sum(rig:namespace_memory_billed_bytes)
@@ -14,7 +14,7 @@
 
 services:
   postgres:
-    image: postgres:15-alpine
+    image: postgres:17-alpine
     container_name: opi-postgres-dev
     environment:
       POSTGRES_DB: opi
 
@@ -22,3 +22,21 @@ patches:
       - op: add
         path: /spec/enablePDB
         value: false
+      # Production is a shared multi-tenant instance (Keycloak, Forgejo, OPI +
+      # all project/PR databases). The 512Mi/500m base default is a dev value;
+      # under load (many concurrent project/PR connections + high log volume)
+      # the CNPG instance-manager health endpoint starves and the primary gets
+      # liveness-killed, taking the whole platform DB down on every restart.
+      # Give production real headroom.
+      - op: replace
+        path: /spec/resources/limits/memory
+        value: 2Gi
+      - op: replace
+        path: /spec/resources/limits/cpu
+        value: "2"
+      - op: replace
+        path: /spec/resources/requests/memory
+        value: 512Mi
+      - op: replace
+        path: /spec/resources/requests/cpu
+        value: 250m
@@ -472,45 +472,84 @@ async def application_exists(self, app_name: str) -> bool:
         logger.debug(f"Application {app_name} exists: {exists}")
         return exists
 
-    async def wait_for_application_deletion(self, app_name: str, max_retries: int = 5, retry_delay: int = 3) -> bool:
+    async def wait_for_application_deletion(
+        self,
+        app_name: str,
+        max_retries: int = 5,
+        retry_delay: int = 3,
+        kubectl_connector: Any = None,
+        namespace: str = "rig-prd-operations",
+    ) -> bool:
         """
         Wait for an ArgoCD application to be fully deleted.
 
+        ArgoCD is queried first - it is, and ought to remain, our primary source of
+        truth. But its API has proven untrustworthy under control-plane stress: it
+        returns 'permission denied' to an admin caller for applications that still
+        exist, conflating "gone", "can't see it", and "I'm stalled". So we never treat
+        that response as "deleted". When ArgoCD's answer is anything but a confident
+        "still exists", we double-check against the Kubernetes API, which fails
+        honestly (the object, a clean NotFound, or a distinguishable error).
+
         Args:
             app_name: Name of the application to wait for
             max_retries: Maximum number of retries
             retry_delay: Delay between retries in seconds
+            kubectl_connector: Connector used to confirm absence against the Kubernetes
+                API when ArgoCD is ambiguous. Without it, an ambiguous ArgoCD answer
+                cannot be confirmed and deletion is reported as unconfirmed (False).
+            namespace: Namespace holding the ArgoCD Application CR
 
         Returns:
-            True if application was deleted (or permission denied, indicating AppProject is gone),
-            False if it still exists after max retries
+            True only if the application is confirmed deleted, False otherwise.
         """
         import asyncio
 
         logger.info(f"Waiting for application deletion: {app_name} (max {max_retries} retries)")
 
+        async def _confirmed_gone_via_k8s() -> bool:
+            # Ground-truth fallback. Only a clean NotFound (False) counts as gone;
+            # still-present (True) or unknown (None) must not be read as deleted.
+            if kubectl_connector is None:
+                return False
+            return (await kubectl_connector.argocd_application_exists(app_name, namespace)) is False
+
         for attempt in range(max_retries):
             try:
                 exists = await self.application_exists(app_name)
-                if not exists:
-                    logger.info(f"Application {app_name} successfully deleted after {attempt + 1} checks")
+                # ArgoCD reports the app gone. A clean 404 is fairly reliable, but since
+                # the same API lies under stress we still confirm via the Kubernetes API
+                # when we can before declaring success.
+                if not exists and (kubectl_connector is None or await _confirmed_gone_via_k8s()):
+                    logger.info(f"Application {app_name} confirmed deleted after {attempt + 1} checks")
                     return True
 
                 logger.debug(f"Application {app_name} still exists, retry {attempt + 1}/{max_retries}")
-                if attempt < max_retries - 1:  # Don't sleep on the last attempt
-                    await asyncio.sleep(retry_delay)
 
             except PermissionError:
-                # Permission denied means the AppProject was deleted before the Application.
-                # This indicates the Application is deleted or will be garbage collected.
-                logger.info(f"Application {app_name} - permission denied (AppProject deleted), treating as deleted")
-                return True
+                # FALLBACK: 'permission denied' is NOT proof the app is gone. The ArgoCD
+                # API has proven it cannot be trusted here - it returns this to an admin
+                # while merely stalled, for apps that still exist. Until ArgoCD can be
+                # trusted again (and it really ought to be our single source of truth,
+                # and may be in the future), we double-check the Kubernetes API directly.
+                if await _confirmed_gone_via_k8s():
+                    logger.info(
+                        f"Application {app_name} confirmed deleted via Kubernetes API "
+                        f"(ArgoCD returned permission denied; not trusting it as 'deleted')"
+                    )
+                    return True
+                logger.warning(
+                    f"Application {app_name}: ArgoCD returned permission denied but the Kubernetes API "
+                    f"shows it still present (or could not confirm) - treating as NOT deleted"
+                )
+
             except Exception as e:
                 logger.error(f"Error checking application deletion status: {e}")
-                if attempt < max_retries - 1:
-                    await asyncio.sleep(retry_delay)
 
-        logger.warning(f"Application {app_name} still exists after {max_retries} retries")
+            if attempt < max_retries - 1:  # Don't sleep on the last attempt
+                await asyncio.sleep(retry_delay)
+
+        logger.warning(f"Application {app_name} NOT confirmed deleted after {max_retries} retries")
         return False
 
 
 
@@ -514,6 +514,36 @@ async def delete_argocd_application(self, app_name: str, namespace: str = "rig-p
             logger.error(f"Failed to delete ArgoCD Application '{app_name}': {stderr}")
             return False
 
+    async def argocd_application_exists(self, app_name: str, namespace: str = "rig-prd-operations") -> bool | None:
+        """
+        Ground-truth existence check for an ArgoCD Application CR via the Kubernetes API.
+
+        This is the honest fallback for the ArgoCD API, which returns an ambiguous
+        'permission denied' when its cache is stalled (see
+        ArgoConnector.wait_for_application_deletion). The Kubernetes API instead answers
+        cleanly: the object exists, a definitive NotFound, or a transport error we can
+        tell apart.
+
+        Args:
+            app_name: The name of the ArgoCD Application
+            namespace: The namespace where the Application exists (default: rig-prd-operations)
+
+        Returns:
+            True  - the Application CR exists
+            False - the Application CR is confirmed absent (NotFound)
+            None  - the check itself failed; existence is unknown, so the caller must
+                    not conclude the application is gone
+        """
+        stdout, stderr, code = await self._run_kubectl_command(
+            ["get", "application", app_name, "-n", namespace, "-o", "name"]
+        )
+        if code == 0:
+            return True
+        if "notfound" in stderr.lower().replace(" ", ""):
+            return False
+        logger.warning(f"Could not determine existence of ArgoCD Application '{app_name}': {stderr}")
+        return None
+
     async def wait_for_capsule_tenant_label(self, namespace: str, timeout: int = 30) -> bool:
         """
         Wait for Capsule to assign the tenant label to a namespace.
 
@@ -197,6 +197,43 @@ def is_deployment_domain_approved(
     return True
 
 
+def find_deployments_for_domain_item(project_data: dict[str, Any], item: dict[str, Any]) -> list[str]:
+    """Return names of deployments that use the domain/subdomain in an approval item.
+
+    Used to scope a redeploy on domain/subdomain approval to only the affected
+    deployment(s) instead of reprocessing the whole project.
+
+    A deployment uses an item when its ``base-domain`` equals the item's domain.
+    For ``subdomain`` items the deployment's ``subdomain`` must also match the
+    item's ``name``; ``domain`` items match every deployment on that base domain.
+
+    Args:
+        project_data: Parsed project YAML data
+        item: An approval item: ``{type, domain, name, ...}`` (see _approval_items)
+
+    Returns:
+        Deployment names referencing the item (may be empty).
+    """
+    domain = item.get("domain", "")
+    if not domain:
+        return []
+
+    sub_name = item.get("name", "") if item.get("type") == "subdomain" else None
+
+    result: list[str] = []
+    for dep in project_data.get("deployments", []):
+        if not isinstance(dep, dict):
+            continue
+        if dep.get("base-domain") != domain:
+            continue
+        if sub_name is not None and dep.get("subdomain") != sub_name:
+            continue
+        name = dep.get("name")
+        if name:
+            result.append(name)
+    return result
+
+
 def is_domain_format_dot_based(domain_format: str) -> bool:
     """Check if a domain format uses dot notation between parts.
 
 
@@ -109,18 +109,24 @@ async def handle_delete_deployment(payload: dict, progress: Any) -> dict:
         deletion_results = await project_manager.delete_deployment(project_name, deployment_name, force=True)
 
         success = deletion_results.get("success", False) if isinstance(deletion_results, dict) else False
-        if success:
-            progress.complete_task(delete_task)
-            message = f"Deployment '{deployment_name}' in project '{project_name}' deleted successfully"
-        else:
-            progress.fail_task(delete_task, "Deletion completed with some errors")
-            message = f"Deployment '{deployment_name}' deletion completed with some errors"
-
-        status_msg = "completed successfully" if success else "completed with some errors"
-        logger.info(f"Task: deployment deletion {status_msg} for {project_name}/{deployment_name}")
+        if not success:
+            # Fail the TASK (not just the sub-task) when the delete left errors behind.
+            # Previously this returned a "partial" result and the top-level task reported
+            # success, so partially-failed deletes were treated as done and stale
+            # deployments accumulated (orphaned previews). delete_deployment is idempotent,
+            # so the caller / nightly cleaner safely retries until it converges.
+            errors = deletion_results.get("errors", []) if isinstance(deletion_results, dict) else []
+            error_detail = "; ".join(str(e) for e in errors) or "unknown error"
+            raise RuntimeError(
+                f"Deployment '{deployment_name}' in project '{project_name}' not fully deleted: {error_detail}"
+            )
+
+        progress.complete_task(delete_task)
+        message = f"Deployment '{deployment_name}' in project '{project_name}' deleted successfully"
+        logger.info(f"Task: deployment deletion completed successfully for {project_name}/{deployment_name}")
 
         return {
-            "status": "completed" if success else "partial",
+            "status": "completed",
             "message": message,
             "project": project_name,
             "deployment": deployment_name,
 
@@ -35,6 +35,13 @@ async def handle_create_project(payload: dict, progress: Any) -> dict:
     start_time = time.time()
     project_name: str = payload["project_name"]
     pre_built_yaml: str | None = payload.get("yaml_content")
+    # Scope processing to specific deployment(s) when the caller specified them:
+    # ``deployment_name`` for a single deployment (e.g. a web-address edit in the modal) or
+    # ``deployment_names`` for an explicit set (e.g. domain approval affecting one
+    # or more deployments). When both are absent (full project create/update) the
+    # filter stays None and all deployments are processed as before.
+    deployment_name: str | None = payload.get("deployment_name")
+    deployment_names: list[str] | None = payload.get("deployment_names")
 
     # ------------------------------------------------------------------
     # Step 1: Validation
@@ -149,7 +156,9 @@ async def handle_create_project(payload: dict, progress: Any) -> dict:
             git_connector_for_project_files=git_connector_for_project_files,
         )
         try:
-            processing_result = await project_manager.process_project_from_git(project_file_path, progress)
+            processing_result = await project_manager.process_project_from_git(
+                project_file_path, progress, deployment_name=deployment_name, deployment_names=deployment_names
+            )
             logger.info("Project processing completed, result: %s", processing_result)
         finally:
             await project_manager.close()