Skip to content

Commit fd0d20b

Browse files
authored
Merge branch 'main' into feat/enhance-evidence-collection
2 parents 3d149db + 7f4da79 commit fd0d20b

File tree

19 files changed

+691
-804
lines changed

19 files changed

+691
-804
lines changed

.github/copilot-instructions.md

Lines changed: 4 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -872,14 +872,9 @@ make e2e-tilt
872872
### Agent Deployment Pattern
873873

874874
```bash
875-
# Deploy agent for automated snapshots
876-
kubectl apply -f deployments/aicr-agent/1-deps.yaml
877-
kubectl apply -f deployments/aicr-agent/2-job.yaml
878-
879-
# Check logs
880-
kubectl logs -n gpu-operator job/aicr
875+
# Deploy agent for automated snapshots (CLI handles RBAC + Job lifecycle)
876+
aicr snapshot --output snapshot.yaml
881877
882-
# Get snapshot from ConfigMap
883-
kubectl get configmap aicr-snapshot -n gpu-operator \
884-
-o jsonpath='{.data.snapshot\.yaml}' > snapshot.yaml
878+
# Or write to ConfigMap
879+
aicr snapshot --output cm://gpu-operator/aicr-snapshot
885880
```

demos/e2e.md

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,6 @@ curl -s "https://aicr.dgxc.io/v1/recipe?service=eks&accelerator=l40&intent=train
9696
9797
```shell
9898
aicr snapshot \
99-
--deploy-agent \
10099
--namespace gpu-operator \
101100
--node-selector nodeGroup=customer-gpu \
102101
--output cm://gpu-operator/aicr-snapshot

deployments/aicr-agent/1-deps.yaml

Lines changed: 0 additions & 77 deletions
This file was deleted.

deployments/aicr-agent/2-job.yaml

Lines changed: 0 additions & 100 deletions
This file was deleted.

docs/contributor/cli.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -395,7 +395,7 @@ subjects:
395395
- RBAC RoleBinding must reference the correct namespace
396396
- ConfigMap is automatically created if it doesn't exist
397397
- Supports update pattern (overwrite existing snapshots)
398-
- For complete examples, see [deployments/aicr-agent/](../../../deployments/aicr-agent/)
398+
- RBAC and Job resources are created programmatically by `pkg/k8s/agent`
399399
```
400400

401401
### Recipe Command: `pkg/cli/recipe.go`

docs/integrator/README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ This section is for integrators who:
2525

2626
```shell
2727
# Deploy API server to Kubernetes
28-
kubectl apply -k https://github.com/NVIDIA/aicr/deployments/aicrd
28+
kubectl apply -k https://github.com/NVIDIA/aicr/deploy/aicrd
2929

3030
# Generate recipe via API
3131
curl "http://aicrd.aicr.svc/v1/recipe?service=eks&accelerator=h100"

docs/integrator/automation.md

Lines changed: 8 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -40,8 +40,7 @@ jobs:
4040

4141
- name: Deploy AICR Agent
4242
run: |
43-
kubectl apply -f https://raw.githubusercontent.com/nvidia/aicr/main/deployments/aicr-agent/1-deps.yaml
44-
kubectl apply -f https://raw.githubusercontent.com/nvidia/aicr/main/deployments/aicr-agent/2-job.yaml
43+
aicr snapshot --output cm://gpu-operator/aicr-snapshot --timeout 300s
4544
4645
- name: Wait for completion
4746
run: |
@@ -88,9 +87,7 @@ capture_snapshot:
8887
stage: snapshot
8988
image: bitnami/kubectl:latest
9089
script:
91-
- kubectl apply -f deployments/aicr-agent/2-job.yaml
92-
- kubectl wait --for=condition=complete job/aicr -n gpu-operator
93-
- kubectl get configmap aicr-snapshot -n gpu-operator -o jsonpath='{.data.snapshot\.yaml}' > snapshot.yaml
90+
- aicr snapshot --output snapshot.yaml --timeout 300s
9491
artifacts:
9592
paths:
9693
- snapshot.yaml
@@ -225,9 +222,7 @@ for cluster_config in "${CLUSTERS[@]}"; do
225222
kubectl config use-context "$CLUSTER"
226223

227224
# Capture snapshot
228-
kubectl apply -f deployments/aicr-agent/2-job.yaml
229-
kubectl wait --for=condition=complete --timeout=300s job/aicr -n gpu-operator
230-
kubectl get configmap aicr-snapshot -n gpu-operator -o jsonpath='{.data.snapshot\.yaml}' > "snapshot-${CLUSTER}.yaml"
225+
aicr snapshot --output "snapshot-${CLUSTER}.yaml" --timeout 300s
231226

232227
# Generate recipe (can use ConfigMap directly or file)
233228
# Option 1: Use ConfigMap
@@ -390,32 +385,15 @@ done
390385
```hcl
391386
# modules/aicr-agent/main.tf
392387
393-
resource "kubectl_manifest" "aicr_deps" {
394-
yaml_body = file("${path.module}/manifests/1-deps.yaml")
395-
}
396-
397-
resource "kubectl_manifest" "aicr_job" {
398-
yaml_body = templatefile("${path.module}/manifests/2-job.yaml", {
399-
node_selector = var.node_selector
400-
tolerations = var.tolerations
401-
image_version = var.image_version
402-
})
403-
404-
depends_on = [kubectl_manifest.aicr_deps]
405-
}
406-
407-
# Wait for job completion and get snapshot from ConfigMap
408-
resource "null_resource" "wait_for_snapshot" {
388+
# Deploy agent and capture snapshot using CLI
389+
resource "null_resource" "capture_snapshot" {
409390
provisioner "local-exec" {
410391
command = <<-EOT
411-
kubectl wait --for=condition=complete \
412-
--timeout=300s job/aicr -n gpu-operator
413-
kubectl get configmap aicr-snapshot -n gpu-operator \
414-
-o jsonpath='{.data.snapshot\.yaml}' > ${var.snapshot_output}
392+
aicr snapshot \
393+
--output ${var.snapshot_output} \
394+
--timeout 300s
415395
EOT
416396
}
417-
418-
depends_on = [kubectl_manifest.aicr_job]
419397
}
420398
421399
# Generate recipe (can use ConfigMap directly)

docs/integrator/kubernetes-deployment.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ Deploy the AICR API Server in your Kubernetes cluster for self-hosted recipe gen
4141
kubectl create namespace aicr
4242

4343
# Deploy API server
44-
kubectl apply -k https://github.com/NVIDIA/aicr/deployments/aicrd
44+
kubectl apply -k https://github.com/NVIDIA/aicr/deploy/aicrd
4545

4646
# Check deployment
4747
kubectl get pods -n aicr

0 commit comments

Comments
 (0)