Skip to content

Commit 11d76c5

Browse files
Merge origin/main in claude/sandbox-uid-override (resolve test_fixup_v2_data)
2 parents 2f87ff0 + c2bb4d4 commit 11d76c5

54 files changed

Lines changed: 3945 additions & 85 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

bootstrap/rig-system/kustomize/operations-manager/overlays/odcn-production/kustomization.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ resources:
1111
- ingress.yaml
1212
- configmap.yaml
1313
- network-policy.yaml
14+
- prometheusrule-billing.yaml
1415

1516
# Additional hostname zad.rijksapp.nl with Let's Encrypt (transition period)
1617
- issuer-letsencrypt.yaml
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
name: operations-manager-billing
5+
labels:
6+
app: operations-manager
7+
spec:
8+
groups:
9+
# Precomputed billing metric for the usage-and-costs page.
10+
# ODCN bills memory as requests + clamp_min(usage - requests, 0).
11+
# Recording this hourly means the costs page no longer has to run a heavy
12+
# nested per-pod aggregation over 30 days (requested by ODCN).
13+
# The 1h interval is deliberate: usage changes slowly and this figure is only indicative.
14+
- name: rig-billing.rules
15+
interval: 1h
16+
rules:
17+
- record: rig:namespace_memory_billed_bytes
18+
expr: |
19+
sum by (namespace) (
20+
sum by (namespace, pod) (
21+
kube_pod_resource_request{job="scheduler", namespace=~"rig-prd-.*", resource="memory"}
22+
)
23+
+ clamp_min(
24+
sum by (namespace, pod) (
25+
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", namespace=~"rig-prd-.*", container!="", image!="", prometheus!="openshift-monitoring/k8s"}
26+
)
27+
- sum by (namespace, pod) (
28+
kube_pod_resource_request{job="scheduler", namespace=~"rig-prd-.*", resource="memory"}
29+
),
30+
0
31+
)
32+
)
33+
- record: rig:tenant_memory_billed_bytes
34+
expr: sum(rig:namespace_memory_billed_bytes)

docker-compose.dev.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
services:
1616
postgres:
17-
image: postgres:15-alpine
17+
image: postgres:17-alpine
1818
container_name: opi-postgres-dev
1919
environment:
2020
POSTGRES_DB: opi

infrastructure/bootstrap/infrastructure/postgresql/database/overlays/odcn/kustomization.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,21 @@ patches:
2222
- op: add
2323
path: /spec/enablePDB
2424
value: false
25+
# Production is a shared multi-tenant instance (Keycloak, Forgejo, OPI +
26+
# all project/PR databases). The 512Mi/500m base default is a dev value;
27+
# under load (many concurrent project/PR connections + high log volume)
28+
# the CNPG instance-manager health endpoint starves and the primary gets
29+
# liveness-killed, taking the whole platform DB down on every restart.
30+
# Give production real headroom.
31+
- op: replace
32+
path: /spec/resources/limits/memory
33+
value: 2Gi
34+
- op: replace
35+
path: /spec/resources/limits/cpu
36+
value: "2"
37+
- op: replace
38+
path: /spec/resources/requests/memory
39+
value: 512Mi
40+
- op: replace
41+
path: /spec/resources/requests/cpu
42+
value: 250m

operations-manager/python/opi/connectors/argo.py

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -472,45 +472,84 @@ async def application_exists(self, app_name: str) -> bool:
472472
logger.debug(f"Application {app_name} exists: {exists}")
473473
return exists
474474

475-
async def wait_for_application_deletion(
    self,
    app_name: str,
    max_retries: int = 5,
    retry_delay: int = 3,
    kubectl_connector: Any = None,
    namespace: str = "rig-prd-operations",
) -> bool:
    """
    Wait for an ArgoCD application to be fully deleted.

    ArgoCD is queried first - it is, and ought to remain, our primary source of
    truth. But its API has proven untrustworthy under control-plane stress: it
    returns 'permission denied' to an admin caller for applications that still
    exist, conflating "gone", "can't see it", and "I'm stalled". So we never treat
    that response as "deleted". When ArgoCD's answer is anything but a confident
    "still exists", we double-check against the Kubernetes API, which fails
    honestly (the object, a clean NotFound, or a distinguishable error).

    Args:
        app_name: Name of the application to wait for
        max_retries: Maximum number of checks before giving up
        retry_delay: Delay between checks in seconds
        kubectl_connector: Connector used to confirm absence against the Kubernetes
            API. It must expose ``argocd_application_exists(app_name, namespace)``
            returning True / False / None. When it is None, a "not found" answer
            from ArgoCD is accepted as-is, while a 'permission denied' answer
            cannot be confirmed and is treated as "not deleted".
        namespace: Namespace holding the ArgoCD Application CR

    Returns:
        True only if the application is confirmed deleted, False otherwise
        (including when every check failed or stayed ambiguous).
    """
    import asyncio

    logger.info(f"Waiting for application deletion: {app_name} (max {max_retries} retries)")

    async def _confirmed_gone_via_k8s() -> bool:
        # Ground-truth fallback. Only a clean NotFound (False) counts as gone;
        # still-present (True) or unknown (None) must not be read as deleted.
        if kubectl_connector is None:
            return False
        try:
            return (await kubectl_connector.argocd_application_exists(app_name, namespace)) is False
        except Exception as e:
            # This helper is also awaited from inside the ``except PermissionError``
            # handler below, where a raised exception would NOT be caught by the
            # sibling ``except Exception`` and would abort the whole retry loop.
            # Fail closed instead: an unverifiable answer is "not confirmed gone".
            logger.error(f"Error confirming deletion of {app_name} via Kubernetes API: {e}")
            return False

    for attempt in range(max_retries):
        try:
            exists = await self.application_exists(app_name)
            # ArgoCD reports the app gone. A clean 404 is fairly reliable, but since
            # the same API lies under stress we still confirm via the Kubernetes API
            # when we can before declaring success.
            if not exists and (kubectl_connector is None or await _confirmed_gone_via_k8s()):
                logger.info(f"Application {app_name} confirmed deleted after {attempt + 1} checks")
                return True

            logger.debug(f"Application {app_name} still exists, retry {attempt + 1}/{max_retries}")

        except PermissionError:
            # FALLBACK: 'permission denied' is NOT proof the app is gone. The ArgoCD
            # API has proven it cannot be trusted here - it returns this to an admin
            # while merely stalled, for apps that still exist. Until ArgoCD can be
            # trusted again (and it really ought to be our single source of truth,
            # and may be in the future), we double-check the Kubernetes API directly.
            if await _confirmed_gone_via_k8s():
                logger.info(
                    f"Application {app_name} confirmed deleted via Kubernetes API "
                    f"(ArgoCD returned permission denied; not trusting it as 'deleted')"
                )
                return True
            logger.warning(
                f"Application {app_name}: ArgoCD returned permission denied but the Kubernetes API "
                f"shows it still present (or could not confirm) - treating as NOT deleted"
            )

        except Exception as e:
            logger.error(f"Error checking application deletion status: {e}")

        if attempt < max_retries - 1:  # Don't sleep on the last attempt
            await asyncio.sleep(retry_delay)

    logger.warning(f"Application {app_name} NOT confirmed deleted after {max_retries} retries")
    return False
515554

516555

operations-manager/python/opi/connectors/kubectl.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,36 @@ async def delete_argocd_application(self, app_name: str, namespace: str = "rig-p
514514
logger.error(f"Failed to delete ArgoCD Application '{app_name}': {stderr}")
515515
return False
516516

517+
async def argocd_application_exists(self, app_name: str, namespace: str = "rig-prd-operations") -> bool | None:
518+
"""
519+
Ground-truth existence check for an ArgoCD Application CR via the Kubernetes API.
520+
521+
This is the honest fallback for the ArgoCD API, which returns an ambiguous
522+
'permission denied' when its cache is stalled (see
523+
ArgoConnector.wait_for_application_deletion). The Kubernetes API instead answers
524+
cleanly: the object exists, a definitive NotFound, or a transport error we can
525+
tell apart.
526+
527+
Args:
528+
app_name: The name of the ArgoCD Application
529+
namespace: The namespace where the Application exists (default: rig-prd-operations)
530+
531+
Returns:
532+
True - the Application CR exists
533+
False - the Application CR is confirmed absent (NotFound)
534+
None - the check itself failed; existence is unknown, so the caller must
535+
not conclude the application is gone
536+
"""
537+
stdout, stderr, code = await self._run_kubectl_command(
538+
["get", "application", app_name, "-n", namespace, "-o", "name"]
539+
)
540+
if code == 0:
541+
return True
542+
if "notfound" in stderr.lower().replace(" ", ""):
543+
return False
544+
logger.warning(f"Could not determine existence of ArgoCD Application '{app_name}': {stderr}")
545+
return None
546+
517547
async def wait_for_capsule_tenant_label(self, namespace: str, timeout: int = 30) -> bool:
518548
"""
519549
Wait for Capsule to assign the tenant label to a namespace.

operations-manager/python/opi/connectors/subdomain.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,43 @@ def is_deployment_domain_approved(
197197
return True
198198

199199

200+
def find_deployments_for_domain_item(project_data: dict[str, Any], item: dict[str, Any]) -> list[str]:
    """Return names of deployments that use the domain/subdomain in an approval item.

    Used to scope a redeploy on domain/subdomain approval to only the affected
    deployment(s) instead of reprocessing the whole project.

    A deployment uses an item when its ``base-domain`` equals the item's domain.
    For ``subdomain`` items the deployment's ``subdomain`` must also match the
    item's ``name``; ``domain`` items match every deployment on that base domain.

    Args:
        project_data: Parsed project YAML data. A missing or null ``deployments``
            key is treated as "no deployments".
        item: An approval item: ``{type, domain, name, ...}`` (see _approval_items)

    Returns:
        Deployment names referencing the item, in project order (may be empty).
        Non-dict entries and deployments without a ``name`` are skipped.
    """
    domain = item.get("domain", "")
    if not domain:
        return []

    # None means "match any subdomain" (domain-level item); a string must match exactly.
    sub_name = item.get("name", "") if item.get("type") == "subdomain" else None

    # ``deployments:`` with no value parses to None, which ``.get(key, [])`` would
    # return unchanged and the loop below would then fail on; fall back to empty.
    deployments = project_data.get("deployments") or []

    result: list[str] = []
    for dep in deployments:
        if not isinstance(dep, dict):
            continue
        if dep.get("base-domain") != domain:
            continue
        if sub_name is not None and dep.get("subdomain") != sub_name:
            continue
        name = dep.get("name")
        if name:
            result.append(name)
    return result
235+
236+
200237
def is_domain_format_dot_based(domain_format: str) -> bool:
201238
"""Check if a domain format uses dot notation between parts.
202239

operations-manager/python/opi/core/task_handlers_deployment.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -109,18 +109,24 @@ async def handle_delete_deployment(payload: dict, progress: Any) -> dict:
109109
deletion_results = await project_manager.delete_deployment(project_name, deployment_name, force=True)
110110

111111
success = deletion_results.get("success", False) if isinstance(deletion_results, dict) else False
112-
if success:
113-
progress.complete_task(delete_task)
114-
message = f"Deployment '{deployment_name}' in project '{project_name}' deleted successfully"
115-
else:
116-
progress.fail_task(delete_task, "Deletion completed with some errors")
117-
message = f"Deployment '{deployment_name}' deletion completed with some errors"
118-
119-
status_msg = "completed successfully" if success else "completed with some errors"
120-
logger.info(f"Task: deployment deletion {status_msg} for {project_name}/{deployment_name}")
112+
if not success:
113+
# Fail the TASK (not just the sub-task) when the delete left errors behind.
114+
# Previously this returned a "partial" result and the top-level task reported
115+
# success, so partially-failed deletes were treated as done and stale
116+
# deployments accumulated (orphaned previews). delete_deployment is idempotent,
117+
# so the caller / nightly cleaner safely retries until it converges.
118+
errors = deletion_results.get("errors", []) if isinstance(deletion_results, dict) else []
119+
error_detail = "; ".join(str(e) for e in errors) or "unknown error"
120+
raise RuntimeError(
121+
f"Deployment '{deployment_name}' in project '{project_name}' not fully deleted: {error_detail}"
122+
)
123+
124+
progress.complete_task(delete_task)
125+
message = f"Deployment '{deployment_name}' in project '{project_name}' deleted successfully"
126+
logger.info(f"Task: deployment deletion completed successfully for {project_name}/{deployment_name}")
121127

122128
return {
123-
"status": "completed" if success else "partial",
129+
"status": "completed",
124130
"message": message,
125131
"project": project_name,
126132
"deployment": deployment_name,

operations-manager/python/opi/core/task_handlers_project.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ async def handle_create_project(payload: dict, progress: Any) -> dict:
3535
start_time = time.time()
3636
project_name: str = payload["project_name"]
3737
pre_built_yaml: str | None = payload.get("yaml_content")
38+
# Scope processing to specific deployment(s) when the caller specified them:
39+
# ``deployment_name`` for a single deployment (e.g. a modal webadres edit) or
40+
# ``deployment_names`` for an explicit set (e.g. domain approval affecting one
41+
# or more deployments). When both are absent (full project create/update) the
42+
# filter stays None and all deployments are processed as before.
43+
deployment_name: str | None = payload.get("deployment_name")
44+
deployment_names: list[str] | None = payload.get("deployment_names")
3845

3946
# ------------------------------------------------------------------
4047
# Step 1: Validation
@@ -149,7 +156,9 @@ async def handle_create_project(payload: dict, progress: Any) -> dict:
149156
git_connector_for_project_files=git_connector_for_project_files,
150157
)
151158
try:
152-
processing_result = await project_manager.process_project_from_git(project_file_path, progress)
159+
processing_result = await project_manager.process_project_from_git(
160+
project_file_path, progress, deployment_name=deployment_name, deployment_names=deployment_names
161+
)
153162
logger.info("Project processing completed, result: %s", processing_result)
154163
finally:
155164
await project_manager.close()

0 commit comments

Comments
 (0)