diff --git a/validation/Containerfile b/validation/Containerfile
index 0baca89..c59e894 100644
--- a/validation/Containerfile
+++ b/validation/Containerfile
@@ -1,13 +1,12 @@
-ARG BASEIMAGE=registry.fedoraproject.org/fedora:latest
-FROM ${BASEIMAGE}
+FROM registry.access.redhat.com/ubi9/ubi-minimal:9.5
 
-RUN source /etc/os-release && \
-    if [ "${PLATFORM_ID}" == "platform:el9" ]; then dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm; fi && \
-    if [ "${PLATFORM_ID}" == "platform:el10" ]; then dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm; fi
+RUN microdnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+    microdnf install -y python3 python3-configargparse python3-kubernetes && \
+    microdnf clean all
 
-RUN dnf install -y python3-configargparse python3-kubernetes python3-pip python3-build
+COPY llmd_xks_preflight.py /opt/llmd-xks-preflight/
 
-COPY . /root/src
-RUN python3 -m build /root/src -w -o /root/src && python3 -m pip install --no-deps /root/src/*.whl && rm -rf /root/src
+RUN useradd -r -u 1001 -g 0 preflight
+USER 1001
 
-ENTRYPOINT ["/usr/local/bin/llmd-xks-preflight"]
+ENTRYPOINT ["python3", "/opt/llmd-xks-preflight/llmd_xks_preflight.py"]
diff --git a/validation/Makefile b/validation/Makefile
index 9599fb6..27c18b8 100644
--- a/validation/Makefile
+++ b/validation/Makefile
@@ -1,57 +1,62 @@
 # Configurable settings
-MAX_LINE_LENGTH ?= 120
 CONTAINER_REPO ?= localhost/llmd-xks-checks
 CONTAINER_TAG ?= latest
-CONTAINER_TOOL ?= podman
+CONTAINER_TOOL ?= $(shell command -v podman >/dev/null 2>&1 && echo podman || echo docker)
 HOST_KUBECONFIG ?= ~/.kube/config
-FROM ?= registry.fedoraproject.org/fedora:latest
-.PHONY: help container run push lint pep8-fix
+# SUITE can be set to "cluster" or "operators", defaults to "all"
+SUITE ?= all
+
+# SELinux label for volume mounts (only needed for podman)
+VOLUME_OPTS ?= $(shell [ "$(CONTAINER_TOOL)" = "podman" ] && echo ":ro,Z" || echo ":ro")
+
+# CONFIG can be set to a config file path to mount into the container
+CONFIG ?=
+# Config mount and argument (only if CONFIG is set)
+CONFIG_MOUNT ?= $(if $(CONFIG),--volume $(CONFIG):/tmp/config.conf$(VOLUME_OPTS),)
+CONFIG_ARG ?= $(if $(CONFIG),--config /tmp/config.conf,)
+
+.PHONY: help image run push lint pep8-fix
 
 help:
	@echo "Available targets:"
-	@echo "  container      Build a container image from the current directory"
-	@echo "  run            Run the container image with all tests"
-	@echo "  run-cluster    Run the container image with cluster readiness tests"
-	@echo "  run-operators  Run the container image with operators readiness tests"
-	@echo "  push           Push the container image to the container registry"
+	@echo "  image          Build a container image from the current directory"
+	@echo "  run            Run the image with tests (use SUITE=cluster|operators|all)"
+	@echo "  push           Push the image to the container registry"
	@echo "  lint           Check code for PEP8 compliance"
	@echo "  pep8-fix       Automatically fix PEP8 compliance issues"
	@echo ""
	@echo "Configuration settings (all can be overridden by using environment variables):"
-	@echo "  MAX_LINE_LENGTH=$(MAX_LINE_LENGTH)  Python linter line length"
	@echo "  CONTAINER_REPO=$(CONTAINER_REPO)  Container repository tag to use for build and run"
	@echo "  CONTAINER_TAG=$(CONTAINER_TAG)  Container tag to use for build and run"
	@echo "  CONTAINER_TOOL=$(CONTAINER_TOOL)  Container tool to use for build and run"
	@echo "  HOST_KUBECONFIG=$(HOST_KUBECONFIG)  Path to kubeconfig for container run"
-	@echo "  FROM=$(FROM)  Base image to use for the container build"
+	@echo "  SUITE=$(SUITE)  Test suite to run (all, cluster, operators)"
+	@echo "  CONFIG=$(CONFIG)  Path to config file to mount into the container"
 
 # Build a container image from the current directory
-container:
-	$(CONTAINER_TOOL) build $(FROM:%=--build-arg BASEIMAGE=%) --tag $(CONTAINER_REPO):$(CONTAINER_TAG) .
+image:
+	$(CONTAINER_TOOL) build --tag $(CONTAINER_REPO):$(CONTAINER_TAG) .
 
-# Run the container image with all tests
+# Run the container image with tests
 run:
-	$(CONTAINER_TOOL) run --rm -it --volume $(HOST_KUBECONFIG):/root/.kube/config:ro,Z $(CONTAINER_REPO):$(CONTAINER_TAG)
-
-# Run the container image with cluster readiness tests
-run-cluster:
-	$(CONTAINER_TOOL) run --rm -it --volume $(HOST_KUBECONFIG):/root/.kube/config:ro,Z $(CONTAINER_REPO):$(CONTAINER_TAG) -s cluster
-
-# Run the container image with operators readiness tests
-run-operators:
-	$(CONTAINER_TOOL) run --rm -it --volume $(HOST_KUBECONFIG):/root/.kube/config:ro,Z $(CONTAINER_REPO):$(CONTAINER_TAG) -s operators
+	$(CONTAINER_TOOL) run --rm -it --volume $(HOST_KUBECONFIG):/tmp/kubeconfig$(VOLUME_OPTS) $(CONFIG_MOUNT) -e KUBECONFIG=/tmp/kubeconfig $(CONTAINER_REPO):$(CONTAINER_TAG) -s $(SUITE) $(CONFIG_ARG)
 
 # Push the container image to the container registry
 push:
	$(CONTAINER_TOOL) push $(CONTAINER_REPO):$(CONTAINER_TAG)
 
+# Linting settings
+MAX_LINE_LENGTH ?= 120
+
 # Check code for PEP8 compliance
 lint:
+	@command -v flake8 >/dev/null 2>&1 || pip install flake8
	flake8 --max-line-length=$(MAX_LINE_LENGTH) --exclude=build .
 
 # Automatically fix PEP8 compliance issues
 pep8-fix:
+	@command -v autopep8 >/dev/null 2>&1 || pip install autopep8
	autopep8 --max-line-length=$(MAX_LINE_LENGTH) --in-place --recursive .
diff --git a/validation/README.md b/validation/README.md
index 7c602d9..d9080a8 100644
--- a/validation/README.md
+++ b/validation/README.md
@@ -15,6 +15,7 @@ A CLI application for running validation checks against Kubernetes clusters in t
 | Cloud provider | Managed K8s Service |
 | -------------- | ------------------- |
 | [Azure](https://azure.microsoft.com) | [AKS](https://azure.microsoft.com/en-us/products/kubernetes-service) |
+
 ## Container image build
 
@@ -24,24 +25,15 @@ This tool can be packaged and run as a container image and a Containerfile is pr
 In order to build a container locally:
 
 ```bash
-make container
+make image
 ```
 
-By default, the container is built on top of latest Fedora container image. If you have an **entitled Red Hat Enterprise Linux system**, you can use UBI9 (Universal Basic Image) as the base:
+The container is built on top of UBI9 (Universal Base Image 9.5).
 
-```bash
-FROM=registry.access.redhat.com/ubi9:latest make container
-```
-
-Notes:
-  * currently, only UBI version 9 (based on Red Hat Enterprise Linux 9) is supported
-  * while the base image itself can be pulled without registration, the container image will not build without a valid Red Hat entitlement -- if you are running a registered RHEL system, the entitlement is automatically passed to the container at build time
-
-Regardless of base image, the resulting container image repository (name) and tag can be customized by using `CONTAINER_REPO` and `CONTAINER_TAG` environment variables:
+The resulting container image repository (name) and tag can be customized by using `CONTAINER_REPO` and `CONTAINER_TAG` environment variables:
 
 ```bash
-CONTAINER_REPO=quay.io/myusername/llm-d-xks-preflight CONTAINER_TAG=mytag make container
-FROM=registry.access.redhat.com/ubi9:latest CONTAINER_REPO=quay.io/myusername/llm-d-xks-preflight CONTAINER_TAG=mytag make container
+CONTAINER_REPO=quay.io/myusername/llm-d-xks-preflight CONTAINER_TAG=mytag make image
 ```
 
 ## Container image run
@@ -49,8 +41,13 @@ FROM=registry.access.redhat.com/ubi9:latest CONTAINER_REPO=quay.io/myusername/ll
 After building the container image as described above, a helper script to run the validations against a Kubernetes cluster is available:
 
 ```bash
-# using defaults
+# run all tests
 make run
+
+# run specific test suite (cluster or operators)
+SUITE=cluster make run
+SUITE=operators make run
+
 # if the image name and tag have been customized
 CONTAINER_REPO=quay.io/myusername/llm-d-xks-preflight CONTAINER_TAG=mytag make run
 ```
@@ -118,7 +115,7 @@ The application automatically looks for config files in the following locations
 You can also specify a custom config file:
 
 ```bash
-python llmd_xks_checks.py --config /path/to/config.conf
+CONFIG=/path/to/config.conf make run
 ```
 
 Example config file:
@@ -132,4 +129,5 @@ cloud_provider = azure
 
 - `LLMD_XKS_LOG_LEVEL`: Log level (same choices as `--log-level`)
 - `LLMD_XKS_CLOUD_PROVIDER`: Cloud provider (choices: auto, azure)
+- `LLMD_XKS_SUITE`: Test suite to run (choices: all (default), cluster, operators)
 - `KUBECONFIG`: Path to kubeconfig file (standard Kubernetes environment variable)
diff --git a/validation/llmd_xks_checks.py b/validation/llmd_xks_preflight.py
similarity index 74%
rename from validation/llmd_xks_checks.py
rename to validation/llmd_xks_preflight.py
index 863f058..0de6f6d 100755
--- a/validation/llmd_xks_checks.py
+++ b/validation/llmd_xks_preflight.py
@@ -40,90 +40,90 @@ def __init__(self, **kwargs):
         self.crds_cache = None
 
         self.tests = {
-                "cluster": {
-                    "description": "Cluster readiness tests",
-                    "tests": [
-                        {
-                            "name": "instance_type",
-                            "function": self.test_instance_type,
-                            "description": "Test if the cluster has at least one supported instance type",
-                            "suggested_action": "Provision a cluster with at least one supported instance type",
-                            "result": False
-                        },
-                        {
-                            "name": "gpu_availability",
-                            "function": self.test_gpu_availability,
-                            "description": "Test if the cluster has GPU drivers",
-                            "suggested_action": "Provision a cluster with at least one supported GPU driver",
-                            "result": False
-                        },
-                    ]
-                },
-                "operators": {
-                    "description": "Operators readiness tests",
-                    "tests": [
-                        {
-                            "name": "crd_certmanager",
-                            "function": self.test_crd_certmanager,
-                            "description": "test if the cluster has the cert-manager crds",
-                            "suggested_action": "install cert-manager",
-                            "result": False
-                        },
-                        {
-                            "name": "operator_certmanager",
-                            "function": self.test_operator_certmanager,
-                            "description": "test if the cert-manager operator is running properly",
-                            "suggested_action": "install or verify cert-manager deployment",
-                            "result": False
-                        },
-                        {
-                            "name": "crd_sailoperator",
-                            "function": self.test_crd_sailoperator,
-                            "description": "test if the cluster has the sailoperator crds",
-                            "suggested_action": "install sail-operator",
-                            "result": False
-                        },
-                        {
-                            "name": "operator_sail",
-                            "function": self.test_operator_sail,
-                            "description": "test if the sail operator is running properly",
-                            "suggested_action": "install or verify sail operator deployment",
-                            "result": False
-                        },
-                        {
-                            "name": "crd_lwsoperator",
-                            "function": self.test_crd_lwsoperator,
-                            "description": "test if the cluster has the lws-operator crds",
-                            "suggested_action": "install lws-operator",
-                            "result": False,
-                            "optional": True
-                        },
-                        {
-                            "name": "operator_lws",
-                            "function": self.test_operator_lws,
-                            "description": "test if the lws-operator is running properly",
-                            "suggested_action": "install or verify lws operator deployment",
-                            "result": False,
-                            "optional": True
-                        },
-                        {
-                            "name": "crd_kserve",
-                            "function": self.test_crd_kserve,
-                            "description": "test if the cluster has the kserve crds",
-                            "suggested_action": "install kserve",
-                            "result": False,
-                            "optional": False
-                        },
-                        {
-                            "name": "operator_kserve",
-                            "function": self.test_operator_kserve,
-                            "description": "test if the kserve controller is running properly",
-                            "suggested_action": "install or verify kserve deployment",
-                            "result": False,
-                        },
-                    ]
-                }
-            }
+            "cluster": {
+                "description": "Cluster readiness tests",
+                "tests": [
+                    {
+                        "name": "instance_type",
+                        "function": self.test_instance_type,
+                        "description": "Test if the cluster has at least one supported instance type",
+                        "suggested_action": "Provision a cluster with at least one supported instance type",
+                        "result": False
+                    },
+                    {
+                        "name": "gpu_availability",
+                        "function": self.test_gpu_availability,
+                        "description": "Test if the cluster has GPU drivers",
+                        "suggested_action": "Provision a cluster with at least one supported GPU driver",
+                        "result": False
+                    },
+                ]
+            },
+            "operators": {
+                "description": "Operators readiness tests",
+                "tests": [
+                    {
+                        "name": "crd_certmanager",
+                        "function": self.test_crd_certmanager,
+                        "description": "test if the cluster has the cert-manager crds",
+                        "suggested_action": "install cert-manager",
+                        "result": False
+                    },
+                    {
+                        "name": "operator_certmanager",
+                        "function": self.test_operator_certmanager,
+                        "description": "test if the cert-manager operator is running properly",
+                        "suggested_action": "install or verify cert-manager deployment",
+                        "result": False
+                    },
+                    {
+                        "name": "crd_sailoperator",
+                        "function": self.test_crd_sailoperator,
+                        "description": "test if the cluster has the sailoperator crds",
+                        "suggested_action": "install sail-operator",
+                        "result": False
+                    },
+                    {
+                        "name": "operator_sail",
+                        "function": self.test_operator_sail,
+                        "description": "test if the sail operator is running properly",
+                        "suggested_action": "install or verify sail operator deployment",
+                        "result": False
+                    },
+                    {
+                        "name": "crd_lwsoperator",
+                        "function": self.test_crd_lwsoperator,
+                        "description": "test if the cluster has the lws-operator crds",
+                        "suggested_action": "install lws-operator",
+                        "result": False,
+                        "optional": True
+                    },
+                    {
+                        "name": "operator_lws",
+                        "function": self.test_operator_lws,
+                        "description": "test if the lws-operator is running properly",
+                        "suggested_action": "install or verify lws operator deployment",
+                        "result": False,
+                        "optional": True
+                    },
+                    {
+                        "name": "crd_kserve",
+                        "function": self.test_crd_kserve,
+                        "description": "test if the cluster has the kserve crds",
+                        "suggested_action": "install kserve",
+                        "result": False,
+                        "optional": False
+                    },
+                    {
+                        "name": "operator_kserve",
+                        "function": self.test_operator_kserve,
+                        "description": "test if the kserve controller is running properly",
+                        "suggested_action": "install or verify kserve deployment",
+                        "result": False,
+                    },
+                ]
+            }
+        }
 
     def _log_init(self):
         logger = logging.getLogger(__name__)
@@ -167,7 +167,7 @@ def _test_crds_present(self, required_crds):
     def _deployment_ready(self, namespace_name, deployment_name):
         try:
             deployment = self.k8s_client.AppsV1Api().read_namespaced_deployment(
-                    name=deployment_name, namespace=namespace_name)
+                name=deployment_name, namespace=namespace_name)
         except Exception as e:
             self.logger.error(f"{e}")
             return False
@@ -278,13 +278,13 @@ def nvidia_driver_present(node):
                     return True
                 else:
                     self.logger.warning(
-                            f"No allocatable NVIDIA GPUs on node {node.metadata.name}"
-                            " - no NVIDIA GPU drivers present")
+                        f"No allocatable NVIDIA GPUs on node {node.metadata.name}"
+                        " - no NVIDIA GPU drivers present")
                     return False
             else:
                 self.logger.warning(
-                        f"No NVIDIA GPU drivers present on node {node.metadata.name}"
-                        " - no NVIDIA GPU accelerators present")
+                    f"No NVIDIA GPU drivers present on node {node.metadata.name}"
+                    " - no NVIDIA GPU accelerators present")
                 return False
         gpu_found = False
         accelerators = {
@@ -463,7 +463,7 @@ def cli_arguments():
         default="all",
         env_var="LLMD_XKS_SUITE",
         help="Test suite to execute"
-        )
+    )
     return parser.parse_args()
diff --git a/validation/pyproject.toml b/validation/pyproject.toml
index 1290648..a7d054d 100644
--- a/validation/pyproject.toml
+++ b/validation/pyproject.toml
@@ -26,7 +26,7 @@ dependencies = [
 "Bug Tracker" = "https://github.com/kwozyman/llmd-xks-preflight/issues"
 
 [tool.hatch.build.targets.wheel]
-packages = ["llmd_xks_checks.py"]
+packages = ["llmd_xks_preflight.py"]
 
 [project.scripts]
-llmd-xks-preflight = "llmd_xks_checks:main"
+llmd-xks-preflight = "llmd_xks_preflight:main"