Skip to content

Commit 76a5b5b

Browse files
committed
preload everything in gha due to rate-limiting
Signed-off-by: Zoe Blevins <zblevins@nvidia.com>
1 parent d4f6511 commit 76a5b5b

6 files changed

Lines changed: 336 additions & 20 deletions

File tree

deploy/configs/ci-integration-test.yaml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,34 @@ services:
7777
config_store: true
7878
nautobot: true
7979

80+
images:
81+
# CI preloads third-party runtime images into Kind through the host Docker daemon.
82+
# This avoids anonymous in-cluster pulls from Docker Hub during Helm's --wait window.
83+
kind_preload_images:
84+
- docker.io/hashicorp/http-echo:1.0
85+
- docker.io/alpine/kubectl:1.35.4
86+
- docker.io/library/redis:7-alpine
87+
- docker.io/library/nats:2.10-alpine
88+
- docker.io/natsio/nats-box:0.14.3
89+
- docker.io/temporalio/server:1.29
90+
- docker.io/temporalio/admin-tools:1.29
91+
- docker.io/temporalio/ui:v2.37.4
92+
- docker.io/nginxinc/nginx-unprivileged:1.27
93+
- docker.io/envoyproxy/gateway:v1.6.5
94+
- docker.io/envoyproxy/ratelimit:c8765e89
95+
- docker.io/envoyproxy/envoy:distroless-v1.36.5
96+
- quay.io/jetstack/cert-manager-controller:v1.20.2
97+
- quay.io/jetstack/cert-manager-webhook:v1.20.2
98+
- quay.io/jetstack/cert-manager-cainjector:v1.20.2
99+
- quay.io/jetstack/cert-manager-startupapicheck:v1.20.2
100+
- quay.io/jetstack/cert-manager-acmesolver:v1.20.2
101+
- ghcr.io/cloudnative-pg/cloudnative-pg:1.29.0
102+
- ghcr.io/cloudnative-pg/postgresql:18.0-system-trixie
103+
overrides:
104+
postgresql:
105+
repository: ghcr.io/cloudnative-pg/postgresql
106+
tag: 18.0-system-trixie
107+
80108
infrastructure:
81109
gateway: envoyGateway
82110
tls: true

docs/install/index.mdx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ nv-config-manager-installer deploy nv-config-manager-install.yaml \
173173

174174
For a fresh Kind deployment that builds local images, the local-image flags and the three operator install flags are required unless those operators are already installed and images are already available to the cluster. If the Kind cluster name is not `nv-config-manager`, pass the actual name with `--kind-cluster`.
175175

176+
In CI environments where in-cluster pulls are unreliable or rate limited, `--load-kind` can also preload selected third-party runtime images before Helm starts. Add image references under `images.kind_preload_images` in the installer YAML, or pass a comma- or space-separated override with `NVCM_KIND_PRELOAD_IMAGES`.
177+
176178
| Flag | Default | Description |
177179
| :--- | :------ | :---------- |
178180
| `--chart-dir` | `deploy/helm` | Path to the Helm chart |

installer/src/nv_config_manager_installer/deployer.py

Lines changed: 163 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -282,9 +282,16 @@ def _docker_server_platform() -> str:
282282
return f"linux/{arch}"
283283

284284

285+
def _docker_image_id(image: str) -> str:
286+
"""Return Docker's local image ID for an image reference."""
287+
result = _run(["docker", "image", "inspect", "--format", "{{.Id}}", image], check=True)
288+
return result.stdout.strip()
289+
290+
285291
_DEFAULT_IMAGE_REGISTRY = "nvcr.io/nvidian/cfa"
286292
_CNPG_OPERATOR_IMAGE_TAG = "1.29.0"
287293
_PROMETHEUS_OPERATOR_CRDS_CHART_VERSION = "28.0.1"
294+
_KIND_PRELOAD_IMAGES_ENV = "NVCM_KIND_PRELOAD_IMAGES"
288295
_KIND_PRELOAD_IMAGES = (LOADER_POD_IMAGE,)
289296
_KIND_PRELOAD_PLATFORM_IMAGES: dict[str, dict[str, str]] = {
290297
LOADER_POD_IMAGE: {
@@ -299,6 +306,36 @@ def _kind_preload_source_image(image: str, platform_name: str) -> str:
299306
return _KIND_PRELOAD_PLATFORM_IMAGES.get(image, {}).get(platform_name, image)
300307

301308

309+
def _env_kind_preload_images() -> list[str]:
310+
"""Return additional Kind preload images from the environment."""
311+
value = os.environ.get(_KIND_PRELOAD_IMAGES_ENV, "")
312+
return [part for part in value.replace(",", " ").split() if part]
313+
314+
315+
def _kind_preload_images(config: NVConfigManagerInstallConfig) -> list[str]:
316+
"""Return the effective ordered list of images to preload into Kind."""
317+
images: list[str] = []
318+
seen: set[str] = set()
319+
for image in (
320+
*_KIND_PRELOAD_IMAGES,
321+
*config.images.kind_preload_images,
322+
*_env_kind_preload_images(),
323+
):
324+
image = image.strip()
325+
if not image or image in seen:
326+
continue
327+
seen.add(image)
328+
images.append(image)
329+
return images
330+
331+
332+
def _kind_node_names(cluster: str) -> list[str]:
333+
"""Return Docker container names for nodes in a Kind cluster."""
334+
result = _run(["kind", "get", "nodes", "--name", cluster], check=True)
335+
nodes = [line.strip() for line in result.stdout.splitlines() if line.strip()]
336+
return nodes or [f"{cluster}-control-plane"]
337+
338+
302339
def _strip_image_registry(repository: str) -> str:
303340
"""Return repository path with any registry host removed."""
304341
first, sep, rest = repository.partition("/")
@@ -457,6 +494,90 @@ def _run_logged(
457494
return subprocess.CompletedProcess(cmd, proc.returncode, stdout_text, stderr_text)
458495

459496

497+
def _run_logged_pipe(
498+
source_cmd: list[str],
499+
sink_cmd: list[str],
500+
step: DeployStep,
501+
callback: DeployCallback,
502+
*,
503+
check: bool = True,
504+
timeout: int | None = 600,
505+
) -> subprocess.CompletedProcess[str]:
506+
"""Run source_cmd with stdout piped to sink_cmd, streaming command output."""
507+
source_str = " ".join(source_cmd[:4]) + ("..." if len(source_cmd) > 4 else "")
508+
sink_str = " ".join(sink_cmd[:4]) + ("..." if len(sink_cmd) > 4 else "")
509+
callback.on_log(f"$ {source_str} | {sink_str}")
510+
511+
source = subprocess.Popen(source_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
512+
sink = subprocess.Popen(
513+
sink_cmd,
514+
stdin=source.stdout,
515+
stdout=subprocess.PIPE,
516+
stderr=subprocess.PIPE,
517+
)
518+
if source.stdout:
519+
source.stdout.close()
520+
521+
stdout_lines: list[str] = []
522+
stderr_lines: list[str] = []
523+
source_stderr_lines: list[str] = []
524+
deadline = time.monotonic() + timeout if timeout else None
525+
sel = selectors.DefaultSelector()
526+
try:
527+
if source.stderr:
528+
sel.register(source.stderr, selectors.EVENT_READ, "source-stderr")
529+
if sink.stdout:
530+
sel.register(sink.stdout, selectors.EVENT_READ, "sink-stdout")
531+
if sink.stderr:
532+
sel.register(sink.stderr, selectors.EVENT_READ, "sink-stderr")
533+
534+
while sel.get_map():
535+
for proc, cmd in ((source, source_cmd), (sink, sink_cmd)):
536+
_check_deadline(deadline, proc, cmd, timeout)
537+
wait = None if deadline is None else max(0.0, deadline - time.monotonic())
538+
for key, _ in sel.select(timeout=wait):
539+
line_bytes = key.fileobj.readline() # type: ignore[union-attr]
540+
if not line_bytes:
541+
sel.unregister(key.fileobj)
542+
continue
543+
line = line_bytes.decode(errors="replace").rstrip("\n")
544+
step.output.append(line)
545+
callback.on_log(line)
546+
if key.data == "sink-stdout":
547+
stdout_lines.append(line)
548+
elif key.data == "sink-stderr":
549+
stderr_lines.append(line)
550+
else:
551+
source_stderr_lines.append(line)
552+
553+
source.wait()
554+
sink.wait()
555+
except BaseException:
556+
source.kill()
557+
sink.kill()
558+
source.wait()
559+
sink.wait()
560+
raise
561+
finally:
562+
sel.close()
563+
564+
stdout_text = "\n".join(stdout_lines)
565+
stderr_text = "\n".join(stderr_lines)
566+
source_stderr_text = "\n".join(source_stderr_lines)
567+
568+
if check and sink.returncode != 0:
569+
raise subprocess.CalledProcessError(sink.returncode, sink_cmd, stdout_text, stderr_text)
570+
if check and source.returncode != 0:
571+
raise subprocess.CalledProcessError(
572+
source.returncode,
573+
source_cmd,
574+
"",
575+
source_stderr_text,
576+
)
577+
578+
return subprocess.CompletedProcess(sink_cmd, sink.returncode, stdout_text, stderr_text)
579+
580+
460581
def _short_pod_text(value: str, *, limit: int = 140) -> str:
461582
"""Return a compact single-line pod status detail."""
462583
value = " ".join(value.split())
@@ -1193,10 +1314,13 @@ def _load_kind(self) -> None:
11931314
timeout=300,
11941315
)
11951316
platform_name = _docker_server_platform()
1196-
for img in _KIND_PRELOAD_IMAGES:
1317+
for img in _kind_preload_images(self.config):
11971318
# Pull arch-scoped official images when available. Docker can keep
11981319
# multi-platform tags as manifest lists even after --platform,
1199-
# which makes kind's image import fail on missing platform content.
1320+
# which makes kind's default --all-platforms import fail on missing
1321+
# platform content. Import helper images directly into node
1322+
# containerd with the selected platform instead. Avoid ctr's
1323+
# digest refs here; the tag is what Kubernetes needs to resolve.
12001324
source_img = _kind_preload_source_image(img, platform_name)
12011325
self.callback.on_log(
12021326
f"Pulling helper image {source_img} for platform {platform_name}..."
@@ -1207,23 +1331,50 @@ def _load_kind(self) -> None:
12071331
self.callback,
12081332
timeout=300,
12091333
)
1210-
if source_img != img:
1211-
self.callback.on_log(f"Tagging helper image {source_img} as {img}...")
1212-
_run_logged(
1213-
["docker", "tag", source_img, img],
1214-
step,
1215-
self.callback,
1216-
timeout=300,
1217-
)
1218-
self.callback.on_log(f"Loading helper image {img} into Kind cluster {cluster}...")
1334+
image_id = _docker_image_id(source_img)
1335+
self.callback.on_log(f"Tagging selected-platform helper image {image_id} as {img}...")
12191336
_run_logged(
1220-
["kind", "load", "docker-image", img, "--name", cluster],
1337+
["docker", "tag", image_id, img],
12211338
step,
12221339
self.callback,
12231340
timeout=300,
12241341
)
1342+
self._load_kind_helper_image(img, cluster, platform_name, step)
12251343
self._finish_step(step)
12261344

1345+
def _load_kind_helper_image(
1346+
self,
1347+
image: str,
1348+
cluster: str,
1349+
platform_name: str,
1350+
step: DeployStep,
1351+
) -> None:
1352+
"""Load a helper image into Kind node containerd for one platform."""
1353+
nodes = _kind_node_names(cluster)
1354+
for node in nodes:
1355+
self.callback.on_log(f"Loading helper image {image} into Kind node {node}...")
1356+
_run_logged_pipe(
1357+
["docker", "save", image],
1358+
[
1359+
"docker",
1360+
"exec",
1361+
"--privileged",
1362+
"-i",
1363+
node,
1364+
"ctr",
1365+
"--namespace=k8s.io",
1366+
"images",
1367+
"import",
1368+
"--platform",
1369+
platform_name,
1370+
"--snapshotter=overlayfs",
1371+
"-",
1372+
],
1373+
step,
1374+
self.callback,
1375+
timeout=300,
1376+
)
1377+
12271378
def _operator_bundle_root(self) -> Path | None:
12281379
"""Return the airgap bundle root if local charts/manifests are available."""
12291380
chart_dir = Path(self.options.chart_dir).expanduser()

installer/src/nv_config_manager_installer/schema.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,7 @@ class ImagesConfig(BaseModel):
580580
registry: str = "nvcr.io/nvidian/cfa"
581581
tag: str = ""
582582
pull_policy: str = "IfNotPresent"
583+
kind_preload_images: list[str] = Field(default_factory=list)
583584
pull_secret: ImagePullSecret = Field(default_factory=ImagePullSecret)
584585
overrides: dict[str, ImageOverride] = Field(default_factory=dict)
585586

@@ -927,9 +928,11 @@ def _prune_ztp_storage(ztp_storage: dict[str, Any]) -> None:
927928

928929

929930
def _prune_images(images: dict[str, Any]) -> None:
931+
if not images.get("kind_preload_images"):
932+
images.pop("kind_preload_images", None)
930933
if images.get("source") != ImageSource.LOCAL.value:
931934
return
932-
_replace_with_keys(images, {"source"})
935+
_replace_with_keys(images, {"source", "kind_preload_images"})
933936

934937

935938
def _prune_redfish(redfish: dict[str, Any]) -> None:

0 commit comments

Comments
 (0)