diff --git a/.github/actions/playwright-test/action.yml b/.github/actions/playwright-test/action.yml index 973c08b35b..d734e302a0 100644 --- a/.github/actions/playwright-test/action.yml +++ b/.github/actions/playwright-test/action.yml @@ -65,7 +65,7 @@ runs: # Extract version from package.json5 (single source of truth) # package.json5 has: "@playwright/test": "=1.58.1" # Container tag format is: v1.58.1-noble - PKG_VERSION=$(grep -oP '"@playwright/test":\s*"=?\K[0-9.]+' "${WORKING_DIRECTORY}/package.json5") + PKG_VERSION=$(awk -F'"' '/"@playwright\/test"/ {print $4}' "${WORKING_DIRECTORY}/package.json5" | tr -d '=\" ') if [[ -z "$PKG_VERSION" ]]; then echo "::error::Failed to extract Playwright version from package.json5" exit 1 diff --git a/.github/analyze.py b/.github/analyze.py new file mode 100644 index 0000000000..d5e8ba9c71 --- /dev/null +++ b/.github/analyze.py @@ -0,0 +1,15 @@ +import glob +import pathlib +import subprocess + +if __name__ == '__main__': + first = None + # pyrefly: ignore # bad-assignment + for line in glob.glob("**/partial-body.html", recursive=True): + file = pathlib.Path(line) + utils = file.parent.parent + if first is None: + first = utils + else: + print() + subprocess.check_call(f"diff --recursive --no-dereference '{first}' '{utils}'", shell=True) diff --git a/.github/gh.http b/.github/gh.http new file mode 100644 index 0000000000..ba8c54a060 --- /dev/null +++ b/.github/gh.http @@ -0,0 +1,8 @@ +# https://docs.github.com/en/rest/using-the-rest-api/troubleshooting-the-rest-api?apiVersion=2022-11-28#resource-not-accessible + +### GET request to example server +POST https://api.github.com/repos/opendatahub-io/notebooks/issues/1857/labels +Authorization: Bearer +Content-Type: application/json + +### diff --git a/.github/workflows/build-notebooks-TEMPLATE.yaml b/.github/workflows/build-notebooks-TEMPLATE.yaml index d72a3af8ca..31f718f161 100644 --- a/.github/workflows/build-notebooks-TEMPLATE.yaml +++ 
b/.github/workflows/build-notebooks-TEMPLATE.yaml @@ -396,7 +396,9 @@ jobs: pip3 install --quiet --break-system-packages pyyaml command -v uv &>/dev/null || pip3 install --quiet --break-system-packages uv - PREFETCH_ARGS="--component-dir $COMPONENT_DIR --flavor $FLAVOR" + ARCH="${{ inputs.platform }}" + ARCH="${ARCH#*/}" + PREFETCH_ARGS="--component-dir $COMPONENT_DIR --flavor $FLAVOR --arch $ARCH" # AIPCC builds use RHEL-based base images, so prefetch from the rhds # variant to avoid RPM conflicts (e.g. openssl-fips-provider vs # openssl-fips-provider-so). diff --git a/Makefile b/Makefile index 77ae3097db..3a49ccb54c 100644 --- a/Makefile +++ b/Makefile @@ -98,17 +98,20 @@ define build_image # Dockerfile runs). The mount hides the base image's default repos. # Konflux buildah-oci-ta task mounts YUM_REPOS_D_FETCHED at YUM_REPOS_D_TARGET (/etc/yum.repos.d). # See https://github.com/konflux-ci/build-definitions/blob/main/task/buildah-oci-ta/ -$(eval CACHI2_VOLUME := $(if $(and $(wildcard cachi2/output),$(wildcard $(BUILD_DIR)prefetch-input)),\ - --volume $(ROOT_DIR)cachi2/output:/cachi2/output:Z \ - --volume $(ROOT_DIR)cachi2/output/deps/rpm/$(RPM_ARCH)/repos.d/:/etc/yum.repos.d/:Z,)) + $(eval COMPONENT_DIR_STR := $(patsubst %/,%,$(BUILD_DIR))) + $(eval CACHI2_HASH := $(shell python3 -c "import hashlib; print(hashlib.md5('$(COMPONENT_DIR_STR)'.encode()).hexdigest())")) + $(eval CACHI2_DIR := cachi2/output/$(CACHI2_HASH)) + $(eval CACHI2_VOLUME := $(if $(and $(wildcard $(CACHI2_DIR)),$(wildcard $(BUILD_DIR)prefetch-input)),\ + --volume $(ROOT_DIR)$(CACHI2_DIR):/cachi2/output:Z \ + --volume $(ROOT_DIR)$(CACHI2_DIR)/deps/rpm/$(RPM_ARCH)/repos.d/:/etc/yum.repos.d/:Z,)) $(info # Building $(IMAGE_NAME) using $(DOCKERFILE_NAME) with $(CONF_FILE) and $(BUILD_ARGS)...) - @if [ -d '$(BUILD_DIR)prefetch-input' ] && [ ! -d cachi2/output ]; then \ - echo "Prefetch required for hermetic build. 
Run: scripts/lockfile-generators/prefetch-all.sh --component-dir $(patsubst %/,%,$(BUILD_DIR)) -- see scripts/lockfile-generators/README.md"; \ + @if [ -d '$(BUILD_DIR)prefetch-input' ] && [ ! -d '$(CACHI2_DIR)' ]; then \ + echo "Prefetch required for hermetic build. Run: scripts/lockfile-generators/prefetch-all.sh --component-dir $(COMPONENT_DIR_STR) -- see scripts/lockfile-generators/README.md"; \ exit 1; \ fi - @if [ -d cachi2/output ] && [ -d '$(BUILD_DIR)prefetch-input' ] && [ ! -d 'cachi2/output/deps/rpm/$(RPM_ARCH)/repos.d' ]; then \ - echo "Missing RPM repos for $(RPM_ARCH). Re-run: scripts/lockfile-generators/prefetch-all.sh --component-dir $(patsubst %/,%,$(BUILD_DIR))"; \ + @if [ -d '$(CACHI2_DIR)' ] && [ -d '$(BUILD_DIR)prefetch-input' ] && [ ! -d '$(CACHI2_DIR)/deps/rpm/$(RPM_ARCH)/repos.d' ]; then \ + echo "Missing RPM repos for $(RPM_ARCH). Re-run: scripts/lockfile-generators/prefetch-all.sh --component-dir $(COMPONENT_DIR_STR)"; \ exit 1; \ fi $(ROOT_DIR)/scripts/sandbox.py --dockerfile '$(2)' --platform '$(BUILD_ARCH)' -- \ diff --git a/docs/hermetic-guide.md b/docs/hermetic-guide.md index b6e433073d..99008746ed 100644 --- a/docs/hermetic-guide.md +++ b/docs/hermetic-guide.md @@ -40,7 +40,7 @@ Benefits: ┌──────────────────────────────────────────────────────────────────┐ │ Prefetch (before podman build) │ │ │ -│ Local/GHA: prefetch-all.sh → cachi2/output/deps/ │ +│ Local/GHA: prefetch-all.sh → cachi2/output//deps/ │ │ Konflux: prefetch-dependencies Tekton task │ └──────────────────┬───────────────────────────────────────────────┘ │ @@ -216,9 +216,12 @@ detailed usage. 
The `build_image` macro was updated to auto-detect hermetic builds: ```makefile -$(eval CACHI2_VOLUME := $(if $(and $(wildcard cachi2/output),$(wildcard $(BUILD_DIR)prefetch-input)),\ - --volume $(ROOT_DIR)cachi2/output:/cachi2/output:Z \ - --volume $(ROOT_DIR)cachi2/output/deps/rpm/$(RPM_ARCH)/repos.d/:/etc/yum.repos.d/:Z,)) + $(eval COMPONENT_DIR_STR := $(patsubst %/,%,$(BUILD_DIR))) + $(eval CACHI2_HASH := $(shell python3 -c "import hashlib; print(hashlib.md5('$(COMPONENT_DIR_STR)'.encode()).hexdigest())")) + $(eval CACHI2_DIR := cachi2/output/$(CACHI2_HASH)) + $(eval CACHI2_VOLUME := $(if $(and $(wildcard $(CACHI2_DIR)),$(wildcard $(BUILD_DIR)prefetch-input)),\ + --volume $(ROOT_DIR)$(CACHI2_DIR):/cachi2/output:Z \ + --volume $(ROOT_DIR)$(CACHI2_DIR)/deps/rpm/$(RPM_ARCH)/repos.d/:/etc/yum.repos.d/:Z,)) ``` This evaluates per-target: only targets with both `cachi2/output/` and a diff --git a/docs/learnings/hermetic-build-architecture_.md b/docs/learnings/hermetic-build-architecture_.md new file mode 100644 index 0000000000..e56b91fe51 --- /dev/null +++ b/docs/learnings/hermetic-build-architecture_.md @@ -0,0 +1,182 @@ +# Hermetic Build Architecture for Codeserver + +## Overview + +The codeserver workbench (`codeserver/ubi9-python-3.12`) uses a fully hermetic build +where all dependencies (RPMs, npm packages, Python wheels, generic tarballs) are +prefetched before the Docker build runs. The build operates without network access. 
+ +## Build Chain + +```text +prefetch-all.sh → populates cachi2/output/<hash>/deps/ +Makefile → detects cachi2/output/<hash>/ → injects --volume into podman build +sandbox.py → creates minimal build context from Dockerfile COPY/ADD directives +podman build → runs Dockerfile with /cachi2/output/ mounted +``` + +### prefetch-all.sh + +Orchestrates five lockfile generators in sequence: + +| Step | Generator | Input | Output | +|------|-----------|-------|--------| +| 1 | `create-artifact-lockfile.py` | `artifacts.in.yaml` | `cachi2/output/<hash>/deps/generic/` (GPG keys, nfpm, node headers, oc client, VS Code extensions) | +| 2 | `create-requirements-lockfile.sh` | `pyproject.toml` | `cachi2/output/<hash>/deps/pip/` (Python wheels) | +| 3 | `download-npm.sh` | `package-lock.json` files | `cachi2/output/<hash>/deps/npm/` (npm tarballs) | +| 4 | `hermeto-fetch-rpm.sh` | `rpms.lock.yaml` | `cachi2/output/<hash>/deps/rpm/{arch}/` (RPMs + repo metadata) | +| 5 | `create-go-lockfile.sh` | `go.mod` (via git submodule) | `cachi2/output/<hash>/deps/gomod/` (Go modules) | + +Variants are selected via `--rhds` flag: +- Default (`odh`): uses CentOS Stream + UBI repos (no subscription needed) +- `--rhds`: uses RHEL subscription repos (needs `--activation-key` and `--org`) + +### Makefile auto-detection + +```makefile +$(eval COMPONENT_DIR_STR := $(patsubst %/,%,$(BUILD_DIR))) +$(eval CACHI2_HASH := $(shell python3 -c "import hashlib; print(hashlib.md5('$(COMPONENT_DIR_STR)'.encode()).hexdigest())")) +$(eval CACHI2_DIR := cachi2/output/$(CACHI2_HASH)) +$(eval CACHI2_VOLUME := $(if $(and $(wildcard $(CACHI2_DIR)),$(wildcard $(BUILD_DIR)prefetch-input)),\ + --volume $(ROOT_DIR)$(CACHI2_DIR):/cachi2/output:Z \ + --volume $(ROOT_DIR)$(CACHI2_DIR)/deps/rpm/$(RPM_ARCH)/repos.d/:/etc/yum.repos.d/:Z,)) +``` + +When both `cachi2/output/<hash>/` and `<component-dir>/prefetch-input/` exist, the Makefile +automatically mounts the per-component prefetched dependencies into the build. 
+The `<hash>` is the MD5 of the component directory name, allowing concurrent +builds of different components without collisions. The second mount overlays +`/etc/yum.repos.d/` with hermeto-generated repos, making local builds behave +like Konflux (repos are already in place when the Dockerfile runs). + +### sandbox.py + +Wraps `podman build` by creating a minimal build context: +1. Parses the Dockerfile using `bin/buildinputs` (Go tool, Dockerfile → LLB → JSON) +2. Identifies all files referenced in COPY/ADD directives +3. Creates a temporary directory with only those files +4. Passes a `{}` placeholder to podman, which gets replaced with the tmpdir path + +sandbox.py does NOT modify volumes, build args, or repos — it only manages the +build context. + +## cachi2/output Directory Structure + +After prefetching, each component gets its own namespaced directory under +`cachi2/output/<hash>/` (where `<hash>` is the MD5 of the component directory +name, e.g. `cachi2/output/a1b2c3.../`). This prevents collisions when building +multiple components in parallel. + +```text +cachi2/output/ +└── <hash>/ # per-component namespace (MD5 of component dir) + ├── deps/ + │ ├── rpm/ + │ │ ├── x86_64/ + │ │ │ ├── <repo-id>/ # RPM files + repodata/ + │ │ │ └── repos.d/ # Generated .repo files with file:// URLs + │ │ ├── aarch64/ + │ │ ├── ppc64le/ + │ │ └── s390x/ + │ ├── npm/ # npm tarballs + │ ├── pip/ # Python wheels + │ └── generic/ # GPG keys, tarballs, etc. + ├── bom.json + └── .build-config.json +``` + +The Makefile mounts `cachi2/output/<hash>/` at `/cachi2/output/` inside the +container, so the Dockerfile always sees `/cachi2/output/deps/...` regardless +of which component is being built. + +Key detail: when `rpms.in.yaml` declares `moduleEnable: [nodejs:22]`, hermeto +downloads module metadata (`modules.yaml`) alongside the RPMs and includes it +in the generated repodata. This allows `dnf module enable nodejs:22` to work +with the hermeto repos. 
Both our `hermeto-fetch-rpm.sh` wrapper and Konflux's +`prefetch-dependencies-oci-ta` task produce repos with this metadata. + +## Three Build Environments + +### Local development + +```bash +scripts/lockfile-generators/prefetch-all.sh --component-dir codeserver/ubi9-python-3.12 +make codeserver-ubi9-python-3.12 +``` + +The Makefile detects `cachi2/output/<hash>/` and auto-injects the volume mount. + +### GitHub Actions + +The TEMPLATE workflow (`build-notebooks-TEMPLATE.yaml`) handles it transparently: + +1. **Prefetch step**: runs `prefetch-all.sh`, outputs `EXTRA_BUILD_ARGS` with + volume mount +2. **Build step**: runs `make` with `CONTAINER_BUILD_CACHE_ARGS` containing + the volume mount +3. For subscription builds (AIPCC), passes `--rhds --activation-key ... --org ...` + to use the RHDS variant lockfiles + +### Konflux (Tekton) + +1. PipelineRun YAML declares `prefetch-input` entries pointing to lockfiles +2. cachi2's `prefetch-dependencies` task downloads everything using hermeto +3. Build task mounts deps at `/cachi2/output/` automatically +4. Network isolation enforced at the pipeline level + +All three environments produce the same `/cachi2/output/deps/` layout inside +the container because they all use hermeto under the hood for RPM prefetching. +On the host, local/GHA builds use `cachi2/output/<hash>/deps/` while Konflux +uses its own staging directory. 
+ +## Variant Directories (ODH vs RHDS) + +Lockfiles are organized into two variant directories under `prefetch-input/`: + +```text +prefetch-input/ +├── odh/ # upstream (CentOS Stream + UBI repos) +│ ├── rpms.in.yaml +│ ├── rpms.lock.yaml +│ ├── artifacts.in.yaml +│ └── artifacts.lock.yaml +├── rhds/ # downstream (RHEL subscription repos) +│ ├── rpms.in.yaml +│ ├── rpms.lock.yaml +│ ├── artifacts.in.yaml +│ └── artifacts.lock.yaml +├── repos/ # shared DNF repo definitions +├── code-server/ # git submodule (vendored source) +└── patches/ # build patches for offline operation +``` + +ODH uses CentOS Stream packages; RHDS uses RHEL packages. The choice matters +because base images differ: ODH uses a c9s base, AIPCC uses a RHEL base. +Mixing variants causes RPM conflicts (see openssl-fips-provider-conflict.md). + +## Dockerfile Structure + +The Dockerfile is multi-stage with 5 stages: + +| Stage | Purpose | +|-------|---------| +| `rpm-base` | Builds code-server from source into an RPM | +| `whl-cache` | Installs Python wheels, exports compiled C-extension wheels for ppc64le/s390x | +| `cpu-base` | Installs OS packages + tools (oc client, micropipenv, uv) | +| `codeserver` | Final image (code-server + nginx + Python packages) | +| `tests` | Smoke test stage | + +Each stage that runs `dnf install` needs repos configured. Repos are injected +by the infrastructure, not by the Dockerfile: + +- **Local/GHA**: The Makefile volume-mounts `repos.d/` at `/etc/yum.repos.d/`, + overlaying the base image's default repos. +- **Konflux**: The `buildah-oci-ta` task volume-mounts `YUM_REPOS_D_FETCHED` + at `/etc/yum.repos.d/` in the same way. + +Both environments replace the base image's default repos. For targets that +need nodejs (codeserver), `rpms.in.yaml` declares `moduleEnable: [nodejs:22]`, +which makes hermeto include module metadata in the repodata. The Dockerfile +runs `dnf module enable nodejs:22 -y` to activate the module stream. 
+ +No `LOCAL_BUILD` build arg, no if/else branching, no `rm -f` or `cp` of repos. diff --git a/scripts/lockfile-generators/README.md b/scripts/lockfile-generators/README.md index 3e1492419a..2e07246272 100644 --- a/scripts/lockfile-generators/README.md +++ b/scripts/lockfile-generators/README.md @@ -44,14 +44,14 @@ All scripts must be run from the **repository root**. **For most local and CI use, this is the main script you need to run.** `prefetch-all.sh` orchestrates all five lockfile generators in the correct -order, downloading dependencies into `cachi2/output/deps/`. After running it, -the Makefile auto-detects `cachi2/output/` and passes `--volume` to +order, downloading dependencies into `cachi2/output/<hash>/deps/` (where `<hash>` is the MD5 hash of the component directory name to allow concurrent local builds). After running it, +the Makefile auto-detects the component's `cachi2/output/` directory and passes `--volume` to `podman build`. ```bash # Upstream ODH (default variant, CentOS Stream base, no subscription): scripts/lockfile-generators/prefetch-all.sh \ - --component-dir codeserver/ubi9-python-3.12 + --component-dir codeserver/ubi9-python-3.12 --arch aarch64 # Downstream RHDS (with RHEL subscription for cdn.redhat.com RPMs): scripts/lockfile-generators/prefetch-all.sh \ @@ -78,6 +78,7 @@ gmake codeserver-ubi9-python-3.12 BUILD_ARCH=linux/arm64 PUSH_IMAGES=no | `--component-dir DIR` | Component directory (required), e.g. `codeserver/ubi9-python-3.12` | | `--rhds` | Use downstream (RHDS) lockfiles instead of upstream (ODH, the default) | | `--flavor NAME` | Lock file flavor (default: `cpu`) | +| `--arch ARCH` | Target architecture to filter downloads (default: host architecture) | | `--activation-key KEY` | Red Hat activation key for RHEL RPMs (optional) | | `--org ORG` | Red Hat organization ID for RHEL RPMs (optional) | @@ -152,7 +153,7 @@ internally. Option 6 (Git submodule) is a manual setup. 
| Helper | Used by | Purpose | |--------|---------|---------| | `helpers/pylock-to-requirements.py` | pip | Convert `pylock.<flavor>.toml` (PEP 751) to pip-compatible `requirements.<flavor>.txt` with `--hash` lines. | -| `helpers/download-pip-packages.py` | pip | Standalone pip downloader: downloads wheels/sdists from a `requirements.txt` (with `--hash` lines) into `cachi2/output/deps/pip/`. Not called by `create-requirements-lockfile.sh` (which has its own inline download from pylock.toml). | +| `helpers/download-pip-packages.py` | pip | Pip downloader: downloads wheels/sdists from a `requirements.txt` (with `--hash` lines) into `cachi2/output/<hash>/deps/pip/`. Called by `create-requirements-lockfile.sh --download`. Supports `--arch` filtering and parallel downloads. | | `helpers/download-rpms.sh` | RPM | Download RPMs from `rpms.lock.yaml` via `wget` into `cachi2/output/deps/rpm/` and create DNF repo metadata. Standalone alternative to `hermeto-fetch-rpm.sh`. | | `helpers/hermeto-fetch-rpm.sh` | RPM | Download RPMs from `rpms.lock.yaml` using [Hermeto](https://github.com/hermetoproject/hermeto) in a container. Handles RHEL entitlement cert extraction for `cdn.redhat.com` auth. Called by `create-rpm-lockfile.sh --download`. | | `helpers/hermeto-fetch-npm.sh` | npm | Alternative npm fetcher using [Hermeto](https://github.com/hermetoproject/hermeto) in a container. 
| diff --git a/scripts/lockfile-generators/create-artifact-lockfile.py b/scripts/lockfile-generators/create-artifact-lockfile.py index c105e0e86c..97a944cccc 100755 --- a/scripts/lockfile-generators/create-artifact-lockfile.py +++ b/scripts/lockfile-generators/create-artifact-lockfile.py @@ -27,14 +27,14 @@ import argparse import hashlib import subprocess -import sys +import sys, os from pathlib import Path from typing import Any, Optional import yaml # Constants -CACHE_BASE_DIR = Path("cachi2/output/deps/generic") +CACHE_BASE_DIR = Path(os.environ.get("CACHI2_OUT_DIR", "cachi2/output")) / "deps" / "generic" METADATA_VERSION = "1.0" CHUNK_SIZE = 8192 diff --git a/scripts/lockfile-generators/create-requirements-lockfile.sh b/scripts/lockfile-generators/create-requirements-lockfile.sh index 3e68b99de3..6f77cca855 100755 --- a/scripts/lockfile-generators/create-requirements-lockfile.sh +++ b/scripts/lockfile-generators/create-requirements-lockfile.sh @@ -156,86 +156,13 @@ if [[ "$DO_DOWNLOAD" == true ]]; then # Output directory must match Cachi2 layout so prefetched wheels are found # during hermetic/offline builds (e.g. Docker COPY from cachi2/output/deps/pip). - OUT_DIR="cachi2/output/deps/pip" + OUT_DIR="${CACHI2_OUT_DIR:-cachi2/output}/deps/pip" mkdir -p "$OUT_DIR" - # Use sha256sum on Linux, shasum -a 256 on macOS (portable). - if command -v sha256sum &>/dev/null; then - sha256_of() { sha256sum "$1" | cut -d' ' -f1; } - else - sha256_of() { shasum -a 256 "$1" | cut -d' ' -f1; } - fi - - # Count lines in pylock that look like "url = \"...\" ... sha256 = \"...\"" - # (one per wheel; multi-line wheel blocks have one such line per wheel). - total=$(grep -c 'url = ".*sha256 = "' "$PYLOCK_FILE" || true) - echo " ${total} wheel(s) to download into ${OUT_DIR}/" - echo "" - - idx=0 - # Read one line per wheel from the lockfile (same pattern as above). - while IFS= read -r line; do - idx=$((idx + 1)) - - # Extract URL and expected sha256 from lockfile line (TOML-style). 
- url=$(echo "$line" | sed 's/.*url = "\([^"]*\)".*/\1/') - sha=$(echo "$line" | sed 's/.*sha256 = "\([^"]*\)".*/\1/') - - if [[ -z "$url" || -z "$sha" ]]; then - echo " ERROR: failed to parse url or sha256 from lockfile line (wheel ${idx})" >&2 - echo " line: ${line:0:120}..." >&2 - exit 1 - fi - - # Filename is the last path segment of the URL, without query/fragment. - filename="${url##*/}"; filename="${filename%%[?#]*}" - if [[ -z "$filename" ]]; then - echo " ERROR: could not derive filename from URL (wheel ${idx})" >&2 - echo " URL: ${url}" >&2 - exit 1 - fi - dest="${OUT_DIR}/${filename}" - - echo "[${idx}/${total}] ${filename}" - - # Resume partial runs: reuse a file only if its digest matches this wheel. - if [[ -f "$dest" ]]; then - actual=$(sha256_of "$dest") - if [[ "$actual" == "$sha" ]]; then - echo " Already present (checksum OK), skipping download." - else - echo " WARNING: Ignoring stale or mismatched cached wheel (digest does not match this lockfile entry)." >&2 - echo " file: ${dest}" >&2 - echo " got: ${actual}" >&2 - echo " expected: ${sha}" >&2 - echo " Removing cached file and re-downloading." >&2 - rm -f "$dest" - fi - fi - - if [[ ! -f "$dest" ]]; then - echo " Downloading: ${url}" - if ! wget -q -O "$dest" "$url"; then - echo " ERROR: download failed for ${filename}" >&2 - echo " URL: ${url}" >&2 - echo " Run 'wget -O /dev/null \"${url}\"' to see the full error." >&2 - rm -f "$dest" - exit 1 - fi - fi - - # Verify digest so corrupted downloads are detected. - actual=$(sha256_of "$dest") - if [[ "$actual" != "$sha" ]]; then - echo " ERROR: checksum mismatch (got ${actual}, expected ${sha})" >&2 - rm -f "$dest" - exit 1 - fi - echo " Checksum OK (sha256:${actual:0:16}...)" - done < <(grep 'url = ".*sha256 = "' "$PYLOCK_FILE") + # Delegate to python script for parallel downloading and filtering. 
+ python3 scripts/lockfile-generators/helpers/download-pip-packages.py \ + --output-dir "$OUT_DIR" ${ARCH:+--arch "$ARCH"} "$REQUIREMENTS_FILE" - echo "" - echo "Done: ${total} file(s) present and validated in ${OUT_DIR}/" fi echo "" @@ -243,5 +170,5 @@ echo "=== All done ===" echo " pylock.toml : ${PYLOCK_FILE}" echo " requirements : ${REQUIREMENTS_FILE}" if [[ "$DO_DOWNLOAD" == true ]]; then - echo " wheels : cachi2/output/deps/pip/" -fi \ No newline at end of file + echo " wheels : ${OUT_DIR}/" +fi diff --git a/scripts/lockfile-generators/download-npm.sh b/scripts/lockfile-generators/download-npm.sh index 515e0bdffa..693dbba18f 100755 --- a/scripts/lockfile-generators/download-npm.sh +++ b/scripts/lockfile-generators/download-npm.sh @@ -24,7 +24,7 @@ set -euo pipefail # --- Configuration & Defaults --- SCRIPTS_PATH="scripts/lockfile-generators" -DEST_DIR="./cachi2/output/deps/npm" +DEST_DIR="${CACHI2_OUT_DIR:-cachi2/output}/deps/npm" LOCKFILE="" TEKTON_FILE="" @@ -245,34 +245,33 @@ fi mkdir -p "$DEST_DIR" total=$(echo "$refs" | wc -l | tr -d ' ') -count=0 -downloaded=0 -skipped=0 -failed=0 echo "" echo "Found $total unique packages to download." echo "" -while IFS=$'\t' read -r filename download_url; do - count=$((count + 1)) - +export DEST_DIR +download_file() { + local filename="$1" + local download_url="$2" if [[ -f "$DEST_DIR/$filename" ]]; then - echo "[$count/$total] SKIP Already exists: $filename" - skipped=$((skipped + 1)) + echo "SKIP Already exists: $filename" else if wget -q -O "$DEST_DIR/$filename" "$download_url"; then - echo "[$count/$total] OK Downloaded: $filename" - downloaded=$((downloaded + 1)) + echo "OK Downloaded: $filename" else - echo "[$count/$total] FAIL Failed: $download_url" >&2 - failed=$((failed + 1)) - # Clean up partial download + echo "FAIL Failed: $download_url" >&2 rm -f "$DEST_DIR/$filename" + return 1 fi fi -done <<< "$refs" +} +export -f download_file + +if ! 
echo "$refs" | xargs -n 2 -P 10 bash -c 'download_file "$1" "$2"' _; then + echo "Some npm downloads failed" >&2 + exit 1 +fi echo "" -echo "Finished! Total: $total Downloaded: $downloaded Skipped: $skipped Failed: $failed" -echo "Location: $DEST_DIR" +echo "Finished! Location: $DEST_DIR" diff --git a/scripts/lockfile-generators/helpers/download-pip-packages.py b/scripts/lockfile-generators/helpers/download-pip-packages.py index e35735fbae..1c8fc0ebff 100644 --- a/scripts/lockfile-generators/helpers/download-pip-packages.py +++ b/scripts/lockfile-generators/helpers/download-pip-packages.py @@ -22,8 +22,8 @@ 5. Verify every file's sha256 checksum (whether freshly downloaded or cached). Usage: - python3 scripts/lockfile-generators/download-pip-packages.py \\ - [-o OUTPUT_DIR] + python3 scripts/lockfile-generators/download-pip-packages.py \ + [-o OUTPUT_DIR] [--arch ARCH] Can be invoked standalone or by create-requirements-lockfile.sh (which has its own inline download step for pylock.toml-based workflows). @@ -35,9 +35,11 @@ import subprocess import sys import urllib.request +import os +import concurrent.futures from pathlib import Path -OUT_DIR = Path("cachi2/output/deps/pip") +OUT_DIR = Path(os.environ.get("CACHI2_OUT_DIR", "cachi2/output")) / "deps" / "pip" PYPI_JSON = "https://pypi.org/pypi/{name}/{version}/json" @@ -47,6 +49,9 @@ def get_and_validate_args(): parser.add_argument( "-o", "--output-dir", type=Path, default=OUT_DIR, help=f"Output directory (default: {OUT_DIR})", ) + parser.add_argument( + "--arch", type=str, default=os.environ.get("ARCH", "amd64"), help="Target architecture (e.g. 
amd64, arm64)" + ) args = parser.parse_args() req_path = args.requirements.resolve() out_dir = args.output_dir.resolve() @@ -54,7 +59,7 @@ def get_and_validate_args(): print(f"Error: not a file: {req_path}", file=sys.stderr) sys.exit(1) out_dir.mkdir(parents=True, exist_ok=True) - return req_path, out_dir + return req_path, out_dir, args.arch def detect_index_url(req_path: Path): @@ -104,8 +109,9 @@ def fetch_pypi_urls(name: str, version: str, wanted_hashes: set): """Return list of (url, filename, sha256) for urls whose sha256 is in wanted_hashes.""" url = PYPI_JSON.format(name=name, version=version) try: - with urllib.request.urlopen(url, timeout=30) as r: - data = json.load(r) + req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) + with urllib.request.urlopen(req, timeout=30) as r: + data = json.loads(r.read().decode()) except Exception as e: print(f"Error fetching {url}: {e}", file=sys.stderr) return [] @@ -127,15 +133,11 @@ def fetch_pypi_urls(name: str, version: str, wanted_hashes: set): def fetch_simple_index_urls(index_url: str, name: str, version: str, wanted_hashes: set): - """Return list of (url, filename, sha256) from a PEP 503 simple index page. - - Used for RHOAI and other custom indexes that don't provide a JSON API. 
- """ - # Normalize name for URL: PEP 503 uses lowercase with hyphens + """Return list of (url, filename, sha256) from a PEP 503 simple index page.""" normalized = re.sub(r"[-_.]+", "-", name).lower() page_url = f"{index_url.rstrip('/')}/{normalized}/" try: - req = urllib.request.Request(page_url, headers={"Accept": "text/html"}) + req = urllib.request.Request(page_url, headers={"Accept": "text/html", "User-Agent": "Mozilla/5.0"}) with urllib.request.urlopen(req, timeout=30) as r: html = r.read().decode() except Exception as e: @@ -162,17 +164,65 @@ def wget(url: str, path: Path): subprocess.run(["wget", "-q", "-O", str(path), url], check=True) +def download_and_verify(item): + path, expected_hash, url, name, version, filename = item + if not path.exists(): + print(f" Downloading: {filename}") + try: + wget(url, path) + except subprocess.CalledProcessError as e: + return False, f"Error downloading {filename}: {e}" + + actual = file_sha256(path) + if actual != expected_hash: + return False, f"{filename} checksum mismatch (got {actual}, expected {expected_hash})" + return True, f"{filename} OK" + + +def should_keep_for_arch(filename: str, arch: str) -> bool: + # sdists and non-wheel files have no platform tag — always keep + if not filename.endswith(".whl"): + return True + + # Wheel format: {name}-{ver}(-{build})?-{python}-{abi}-{platform}.whl + # The platform tag is the last segment before .whl + stem = filename[:-4] # strip .whl + parts = stem.split("-") + if len(parts) < 3: + return True + platform_tag = parts[-1] + + # Pure-python wheels use "any" or "none" platform tags + if platform_tag in ("any", "none") or "any" in platform_tag.split("_"): + return True + + ARCH_ALIASES = { + "amd64": ["x86_64", "amd64"], + "x86_64": ["x86_64", "amd64"], + "arm64": ["aarch64", "arm64"], + "aarch64": ["aarch64", "arm64"], + "ppc64le": ["ppc64le"], + "s390x": ["s390x"], + } + ALL_ARCHES = {"x86_64", "amd64", "aarch64", "arm64", "ppc64le", "s390x"} + + # Check if the platform 
tag mentions any known architecture + if not any(a in platform_tag for a in ALL_ARCHES): + return True + + # Keep only if it matches the target architecture + return any(a in platform_tag for a in ARCH_ALIASES.get(arch, [])) + + def main(): - req_path, out_dir = get_and_validate_args() + req_path, out_dir, arch = get_and_validate_args() - # Detect --index-url in requirements file (e.g. RHOAI) index_url = detect_index_url(req_path) use_simple_index = index_url is not None and "pypi.org" not in index_url if use_simple_index: print(f"Detected custom index: {index_url}") print(f"Using PEP 503 simple index for downloads.\n") - # Build list of (path, expected_sha256, url, name, version, filename) per file to have to_fetch = [] for block in get_packages_and_checksums(req_path): name, version, hashes = block_to_name_version_hashes(block) @@ -185,21 +235,25 @@ def main(): results = fetch_pypi_urls(name, version, set(hashes)) for url, filename, expected_hash in results: - to_fetch.append((out_dir / filename, expected_hash, url, name, version, filename)) + if should_keep_for_arch(filename, arch): + to_fetch.append((out_dir / filename, expected_hash, url, name, version, filename)) total = len(to_fetch) - for idx, (path, expected_hash, url, name, version, filename) in enumerate(to_fetch, 1): - print(f"[{idx}/{total}] {name}=={version} {filename}") - if not path.exists(): - print(f" Downloading: {url}") - wget(url, path) - else: - print(f" Already exists, skipping download.") - actual = file_sha256(path) - if actual != expected_hash: - print(f"Error: {path.name} checksum mismatch (got {actual}, expected {expected_hash})", file=sys.stderr) - sys.exit(1) - print(f" Checksum OK (sha256:{actual[:16]}...)") + print(f"Found {total} wheels to fetch for arch {arch}.") + + success = True + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + futures = {executor.submit(download_and_verify, item): item for item in to_fetch} + for future in 
concurrent.futures.as_completed(futures): + ok, msg = future.result() + if not ok: + print(f"Error: {msg}", file=sys.stderr) + success = False + else: + print(f" {msg}") + + if not success: + sys.exit(1) print(f"Done: {total} file(s) present and validated.") diff --git a/scripts/lockfile-generators/helpers/download-rpms.sh b/scripts/lockfile-generators/helpers/download-rpms.sh index d836413ca5..c2f8a8b47e 100755 --- a/scripts/lockfile-generators/helpers/download-rpms.sh +++ b/scripts/lockfile-generators/helpers/download-rpms.sh @@ -15,7 +15,7 @@ set -euo pipefail # --- Configuration & Defaults --- SCRIPTS_PATH="scripts/lockfile-generators" -DEST_DIR="./cachi2/output/deps/rpm" +DEST_DIR="${CACHI2_OUT_DIR:-cachi2/output}/deps/rpm" LOCKFILE="" # --- Functions --- diff --git a/scripts/lockfile-generators/helpers/hermeto-fetch-gomod.sh b/scripts/lockfile-generators/helpers/hermeto-fetch-gomod.sh index 0b2de1d771..563734f805 100755 --- a/scripts/lockfile-generators/helpers/hermeto-fetch-gomod.sh +++ b/scripts/lockfile-generators/helpers/hermeto-fetch-gomod.sh @@ -12,7 +12,7 @@ set -euo pipefail # directory. No separate lockfile is needed — go.sum pins dependencies. 
HERMETO_IMAGE="ghcr.io/hermetoproject/hermeto:0.46.2" -HERMETO_OUTPUT="./cachi2/output" +HERMETO_OUTPUT="${CACHI2_OUT_DIR:-cachi2/output}" PREFETCH_DIR="" diff --git a/scripts/lockfile-generators/helpers/hermeto-fetch-rpm.sh b/scripts/lockfile-generators/helpers/hermeto-fetch-rpm.sh index 6aed5cf701..2b0430d0be 100755 --- a/scripts/lockfile-generators/helpers/hermeto-fetch-rpm.sh +++ b/scripts/lockfile-generators/helpers/hermeto-fetch-rpm.sh @@ -21,12 +21,13 @@ set -euo pipefail UBI9_IMAGE="registry.access.redhat.com/ubi9/ubi" HERMETO_IMAGE="ghcr.io/hermetoproject/hermeto:0.46.2" -HERMETO_OUTPUT="./cachi2/output" +HERMETO_OUTPUT="${CACHI2_OUT_DIR:-cachi2/output}" PREFETCH_DIR="" CERT_DIR="" ACTIVATION_KEY="" ORG="" +ARCH="${ARCH:-}" show_help() { cat << 'EOF' @@ -39,6 +40,7 @@ Options: --cert-dir DIR Directory with pre-extracted entitlement PEM files --activation-key KEY Red Hat activation key for RHEL cert extraction --org ORG Red Hat organization ID for RHEL cert extraction + --arch ARCH Target architecture (e.g. amd64, aarch64) to filter downloads --help Show this help Environment variables (fallback when CLI args are not provided): @@ -63,6 +65,8 @@ while [[ $# -gt 0 ]]; do ACTIVATION_KEY="$2"; shift 2 ;; --org) [[ $# -ge 2 ]] || error_exit "--org requires a value" ORG="$2"; shift 2 ;; + --arch) [[ $# -ge 2 ]] || error_exit "--arch requires a value" + ARCH="$2"; shift 2 ;; -h|--help) show_help; exit 0 ;; *) error_exit "Unknown argument: '$1'" ;; esac @@ -207,11 +211,28 @@ fi # prefetch steps already placed in cachi2/output/deps/. 
# ========================================================================= HERMETO_STAGING=$(mktemp -d) -trap 'rm -rf "$HERMETO_STAGING" ${CDN_CERT_DIR:+"$CDN_CERT_DIR"}' EXIT +trap 'rm -rf "$HERMETO_STAGING" ${CDN_CERT_DIR:+"$CDN_CERT_DIR"} ${CLEANUP_SOURCE:+"$HERMETO_SOURCE"}' EXIT + +HERMETO_SOURCE="$(pwd)/$PREFETCH_DIR" +if [[ -n "$ARCH" ]]; then + command -v yq &>/dev/null || error_exit "--arch requires yq to filter rpms.lock.yaml" + # Translate generic arch to rpm arch format + rpm_arch="$ARCH" + [[ "$ARCH" == "amd64" ]] && rpm_arch="x86_64" + [[ "$ARCH" == "arm64" ]] && rpm_arch="aarch64" + + echo "--- Filtering rpms.lock.yaml for architecture: $rpm_arch ---" + HERMETO_SOURCE=$(mktemp -d) + CLEANUP_SOURCE=1 + # Symlink everything, then replace rpms.lock.yaml with a filtered copy + ln -s "$(pwd)/$PREFETCH_DIR"/* "$HERMETO_SOURCE/" 2>/dev/null || true + rm -f "$HERMETO_SOURCE/rpms.lock.yaml" + yq eval "del(.arches[] | select(.arch != \"$rpm_arch\" and .arch != \"noarch\"))" "$(pwd)/$PREFETCH_DIR/rpms.lock.yaml" > "$HERMETO_SOURCE/rpms.lock.yaml" +fi echo "--- Downloading RPMs via hermeto ---" podman run --rm \ - -v "$(pwd)/$PREFETCH_DIR:/source:z" \ + -v "$HERMETO_SOURCE:/source:z" \ -v "$HERMETO_STAGING:/output:z" \ ${CDN_CERT_DIR:+-v "$CDN_CERT_DIR:/certs:ro,z"} \ "$HERMETO_IMAGE" \ diff --git a/scripts/lockfile-generators/prefetch-all.sh b/scripts/lockfile-generators/prefetch-all.sh index 90c36f78bf..be73943077 100755 --- a/scripts/lockfile-generators/prefetch-all.sh +++ b/scripts/lockfile-generators/prefetch-all.sh @@ -8,16 +8,16 @@ set -euo pipefail # This script orchestrates downloading all five dependency types: # # 1. Generic artifacts — GPG keys, nfpm-built RPMs, Node.js headers, -# Electron binaries, VS Code .vsix (into cachi2/output/deps/generic/). +# Electron binaries, VS Code .vsix (into ${CACHI2_OUT_DIR:-cachi2/output}/deps/generic/). # Component-specific: codeserver gets ripgrep from pip, oc from RPM. # 2. 
Pip wheels — Python packages resolved from pyproject.toml -# (into cachi2/output/deps/pip/). +# (into ${CACHI2_OUT_DIR:-cachi2/output}/deps/pip/). # 3. NPM packages — tarballs resolved from package-lock.json files -# (into cachi2/output/deps/npm/). +# (into ${CACHI2_OUT_DIR:-cachi2/output}/deps/npm/). # 4. RPMs — system packages resolved from rpms.lock.yaml via Hermeto -# (into cachi2/output/deps/rpm/). +# (into ${CACHI2_OUT_DIR:-cachi2/output}/deps/rpm/). # 5. Go modules — Go deps from go.mod/go.sum via Hermeto (gomod) -# (into cachi2/output/deps/gomod/). +# (into ${CACHI2_OUT_DIR:-cachi2/output}/deps/gomod/). # # Each step is skipped if its input file is not present in the component's # prefetch-input// directory. Step 3 (NPM) discovers the Tekton @@ -47,6 +47,9 @@ SCRIPTS_PATH="scripts/lockfile-generators" COMPONENT_DIR="" VARIANT="odh" # "odh" = upstream (CentOS Stream), "rhds" = downstream (RHEL) FLAVOR="cpu" # selects which pylock/requirements files to use (cpu, cuda, rocm) +ARCH=$(uname -m) +[[ "$ARCH" == "x86_64" ]] && ARCH="amd64" +[[ "$ARCH" == "aarch64" ]] && ARCH="arm64" ACTIVATION_KEY="" ORG="" @@ -54,13 +57,14 @@ show_help() { cat << 'HELPEOF' Usage: scripts/lockfile-generators/prefetch-all.sh [OPTIONS] -Download all hermetic build dependencies into cachi2/output/deps/. +Download all hermetic build dependencies into ${CACHI2_OUT_DIR:-cachi2/output}/deps/. Options: --component-dir DIR Component directory (required) e.g. 
codeserver/ubi9-python-3.12 --rhds Use downstream (RHDS) lockfiles instead of upstream (ODH) --flavor NAME Lock file flavor (default: cpu) + --arch NAME Target architecture (default: host architecture) --activation-key KEY Red Hat activation key for RHEL RPMs (optional) --org ORG Red Hat organization ID for RHEL RPMs (optional) -h, --help Show this help @@ -125,6 +129,8 @@ while [[ $# -gt 0 ]]; do --rhds) VARIANT="rhds"; shift ;; --flavor) [[ $# -ge 2 ]] || error_exit "--flavor requires a value" FLAVOR="$2"; shift 2 ;; + --arch) [[ $# -ge 2 ]] || error_exit "--arch requires a value" + ARCH="$2"; shift 2 ;; --activation-key) [[ $# -ge 2 ]] || error_exit "--activation-key requires a value" ACTIVATION_KEY="$2"; shift 2 ;; --org) [[ $# -ge 2 ]] || error_exit "--org requires a value" @@ -135,8 +141,13 @@ while [[ $# -gt 0 ]]; do done [[ -z "$COMPONENT_DIR" ]] && error_exit "--component-dir is required." +COMPONENT_DIR="${COMPONENT_DIR%/}" [[ -d "$COMPONENT_DIR" ]] || error_exit "Component directory not found: $COMPONENT_DIR" +CACHI2_HASH="$(python3 -c "import hashlib, sys; print(hashlib.md5(sys.argv[1].encode()).hexdigest())" "$COMPONENT_DIR")" +export CACHI2_OUT_DIR="cachi2/output/${CACHI2_HASH}" +export ARCH + # CLI args take priority; fall back to env vars so GHA can pass secrets # without exposing them on the command line. GitHub Actions masks env var # values in logs, but command-line args appear in process listings. @@ -198,7 +209,7 @@ STEPS_SKIPPED=0 # Downloads non-package artifacts listed in artifacts.in.yaml: GPG keys # for RPM signature verification, nfpm-packaged RPMs (e.g. code-server), # Node.js headers for native addons, Electron binaries, etc. 
-# Output: cachi2/output/deps/generic/ +# Output: ${CACHI2_OUT_DIR:-cachi2/output}/deps/generic/ # ========================================================================= ARTIFACTS_INPUT="$VARIANT_DIR/artifacts.in.yaml" if [[ -f "$ARTIFACTS_INPUT" ]]; then @@ -219,7 +230,7 @@ fi # pylock..toml + requirements..txt, then downloads all # wheels. The --flavor flag selects which optional dependency groups # to include (e.g. cpu vs cuda have different torch/triton packages). -# Output: cachi2/output/deps/pip/ +# Output: ${CACHI2_OUT_DIR:-cachi2/output}/deps/pip/ # ========================================================================= PYPROJECT="$COMPONENT_DIR/pyproject.toml" if [[ -f "$PYPROJECT" ]]; then @@ -240,7 +251,7 @@ fi # download npm tarballs from the prefetch-input paths listed there. If no # Tekton file is found for this component, skip npm. Requires yq. # -# Output: cachi2/output/deps/npm/ +# Output: ${CACHI2_OUT_DIR:-cachi2/output}/deps/npm/ # ========================================================================= echo "=== [3/5] NPM packages ===" @@ -273,7 +284,7 @@ echo "" # ========================================================================= # Step 4: RPMs (hermeto-fetch-rpm.sh or create-rpm-lockfile.sh --download) # -# Downloads OS-level RPM packages into cachi2/output/deps/rpm/ and creates +# Downloads OS-level RPM packages into ${CACHI2_OUT_DIR:-cachi2/output}/deps/rpm/ and creates # DNF repo metadata so the Dockerfile can `dnf install` offline. # # Two modes depending on whether rpms.lock.yaml already exists: @@ -291,7 +302,7 @@ echo "" # exported env vars (SUBSCRIPTION_ACTIVATION_KEY / SUBSCRIPTION_ORG), # not command-line args. hermeto-fetch-rpm.sh handles cert extraction. 
# -# Output: cachi2/output/deps/rpm/ +# Output: ${CACHI2_OUT_DIR:-cachi2/output}/deps/rpm/ # ========================================================================= RPM_INPUT="$VARIANT_DIR/rpms.in.yaml" RPM_LOCKFILE="$VARIANT_DIR/rpms.lock.yaml" @@ -316,7 +327,7 @@ fi # # Prefetches Go dependencies from go.mod/go.sum via Hermeto (gomod). Uses # the same Tekton file as NPM to discover gomod-type prefetch-input paths. -# Output: cachi2/output/deps/gomod/ +# Output: ${CACHI2_OUT_DIR:-cachi2/output}/deps/gomod/ # ========================================================================= echo "=== [5/5] Go modules ===" if [[ -n "$tekton_file" ]] && command -v yq &>/dev/null; then @@ -354,10 +365,10 @@ echo " tekton file: $tekton_file" echo " steps run : $STEPS_RUN" echo " steps skipped: $STEPS_SKIPPED" echo "" -echo " Dependencies are in: cachi2/output/deps/" -if [[ -d "cachi2/output/deps" ]]; then +echo " Dependencies are in: ${CACHI2_OUT_DIR:-cachi2/output}/deps/" +if [[ -d "${CACHI2_OUT_DIR:-cachi2/output}/deps" ]]; then echo "" - du -sh cachi2/output/deps/*/ 2>/dev/null || true + du -sh ${CACHI2_OUT_DIR:-cachi2/output}/deps/*/ 2>/dev/null || true fi echo "" echo " Next: run 'make ' — it will auto-detect cachi2/output/"