Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 8 additions & 9 deletions validation/Containerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
ARG BASEIMAGE=registry.fedoraproject.org/fedora:latest
FROM ${BASEIMAGE}
FROM registry.access.redhat.com/ubi9/ubi-minimal:9.5

RUN source /etc/os-release && \
if [ "${PLATFORM_ID}" == "platform:el9" ]; then dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm; fi && \
if [ "${PLATFORM_ID}" == "platform:el10" ]; then dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm; fi
RUN microdnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
microdnf install -y python3 python3-configargparse python3-kubernetes && \
microdnf clean all

RUN dnf install -y python3-configargparse python3-kubernetes python3-pip python3-build
COPY llmd_xks_preflight.py /opt/llmd-xks-preflight/

COPY . /root/src
RUN python3 -m build /root/src -w -o /root/src && python3 -m pip install --no-deps /root/src/*.whl && rm -rf /root/src
# Create the unprivileged runtime user. UBI minimal images may not ship
# shadow-utils (the package that provides useradd), so install it explicitly
# first; this is a no-op when the package is already present.
RUN microdnf install -y shadow-utils && \
    microdnf clean all && \
    useradd -r -u 1001 -g 0 preflight
# Run the entrypoint as the non-root user created above (UID 1001, group 0).
USER 1001

ENTRYPOINT ["/usr/local/bin/llmd-xks-preflight"]
ENTRYPOINT ["python3", "/opt/llmd-xks-preflight/llmd_xks_preflight.py"]
51 changes: 28 additions & 23 deletions validation/Makefile
Original file line number Diff line number Diff line change
@@ -1,57 +1,62 @@
# Configurable settings
MAX_LINE_LENGTH ?= 120
CONTAINER_REPO ?= localhost/llmd-xks-checks
CONTAINER_TAG ?= latest
CONTAINER_TOOL ?= podman
# Prefer podman when it is on PATH, otherwise fall back to docker.
# The origin guard keeps the "only if not already set" semantics of ?=, while
# := stores the probe result once instead of re-running the shell command on
# every reference to $(CONTAINER_TOOL).
ifeq ($(origin CONTAINER_TOOL),undefined)
CONTAINER_TOOL := $(shell command -v podman >/dev/null 2>&1 && echo podman || echo docker)
endif
HOST_KUBECONFIG ?= ~/.kube/config
FROM ?= registry.fedoraproject.org/fedora:latest

.PHONY: help container run push lint pep8-fix
# SUITE can be set to "cluster" or "operators", defaults to "all"
SUITE ?= all

# Options appended to bind mounts: always read-only, plus the ",Z" SELinux
# label option when the container tool is podman. Chosen with a make
# conditional at parse time rather than forking a shell on each expansion;
# this must stay below the CONTAINER_TOOL definition.
ifeq ($(CONTAINER_TOOL),podman)
VOLUME_OPTS ?= :ro,Z
else
VOLUME_OPTS ?= :ro
endif

# Optional path to a config file on the host; leave empty to run without one.
CONFIG ?=
# When CONFIG is non-empty, bind-mount it at /tmp/config.conf inside the
# container and hand that path to the tool via --config. When CONFIG is empty,
# both variables expand to nothing.
CONFIG_MOUNT ?= $(and $(CONFIG),--volume $(CONFIG):/tmp/config.conf$(VOLUME_OPTS))
CONFIG_ARG ?= $(and $(CONFIG),--config /tmp/config.conf)

.PHONY: help image run push lint pep8-fix

help:
@echo "Available targets:"
@echo " container Build a container image from the current directory"
@echo " run Run the container image with all tests"
@echo " run-cluster Run the container image with cluster readiness tests"
@echo " run-operators Run the container image with operators readiness tests"
@echo " push Push the container image to the container registry"
@echo " image Build a container image from the current directory"
@echo " run Run the image with tests (use SUITE=cluster|operators|all)"
@echo " push Push the image to the container registry"
@echo " lint Check code for PEP8 compliance"
@echo " pep8-fix Automatically fix PEP8 compliance issues"
@echo ""
@echo "Configuration settings (all can be overridden by using environment variables):"
@echo " MAX_LINE_LENGTH=$(MAX_LINE_LENGTH) Python linter line length"
@echo " CONTAINER_REPO=$(CONTAINER_REPO) Container repository tag to use for build and run"
@echo " CONTAINER_TAG=$(CONTAINER_TAG) Container tag to use for build and run"
@echo " CONTAINER_TOOL=$(CONTAINER_TOOL) Container tool to use for build and run"
@echo " HOST_KUBECONFIG=$(HOST_KUBECONFIG) Path to kubeconfig for container run"
@echo " FROM=$(FROM) Base image to use for the container build"
@echo " SUITE=$(SUITE) Test suite to run (all, cluster, operators)"
@echo " CONFIG=$(CONFIG) Path to config file to mount into the container"


# Build a container image from the current directory
container:
$(CONTAINER_TOOL) build $(FROM:%=--build-arg BASEIMAGE=%) --tag $(CONTAINER_REPO):$(CONTAINER_TAG) .
image:
$(CONTAINER_TOOL) build --tag $(CONTAINER_REPO):$(CONTAINER_TAG) .

# Run the container image with all tests
# Run the container image with tests
run:
$(CONTAINER_TOOL) run --rm -it --volume $(HOST_KUBECONFIG):/root/.kube/config:ro,Z $(CONTAINER_REPO):$(CONTAINER_TAG)

# Run the container image with cluster readiness tests
run-cluster:
$(CONTAINER_TOOL) run --rm -it --volume $(HOST_KUBECONFIG):/root/.kube/config:ro,Z $(CONTAINER_REPO):$(CONTAINER_TAG) -s cluster

# Run the container image with operators readiness tests
run-operators:
$(CONTAINER_TOOL) run --rm -it --volume $(HOST_KUBECONFIG):/root/.kube/config:ro,Z $(CONTAINER_REPO):$(CONTAINER_TAG) -s operators
$(CONTAINER_TOOL) run --rm -it --volume $(HOST_KUBECONFIG):/tmp/kubeconfig$(VOLUME_OPTS) $(CONFIG_MOUNT) -e KUBECONFIG=/tmp/kubeconfig $(CONTAINER_REPO):$(CONTAINER_TAG) -s $(SUITE) $(CONFIG_ARG)


# Push the container image to the container registry
push:
$(CONTAINER_TOOL) push $(CONTAINER_REPO):$(CONTAINER_TAG)

# Linting settings
MAX_LINE_LENGTH ?= 120

# Check code for PEP8 compliance.
# flake8 is installed with pip on demand when it is not found on PATH. The
# line-length limit comes from MAX_LINE_LENGTH, and paths matching "build" are
# excluded from the scan of the current directory.
lint:
@command -v flake8 >/dev/null 2>&1 || pip install flake8
flake8 --max-line-length=$(MAX_LINE_LENGTH) --exclude=build .

# Automatically fix PEP8 compliance issues.
# autopep8 is installed with pip on demand when it is not found on PATH. Files
# are rewritten in place, recursing from the current directory, using the same
# MAX_LINE_LENGTH limit as the lint target.
pep8-fix:
@command -v autopep8 >/dev/null 2>&1 || pip install autopep8
autopep8 --max-line-length=$(MAX_LINE_LENGTH) --in-place --recursive .
28 changes: 13 additions & 15 deletions validation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ A CLI application for running validation checks against Kubernetes clusters in t
| Cloud provider | Managed K8s Service |
| -------------- | ------------------- |
| [Azure](https://azure.microsoft.com) | [AKS](https://azure.microsoft.com/en-us/products/kubernetes-service) |
<!-- | [CoreWeave](https://coreweave.com) | [CKS](https://coreweave.com/products/coreweave-kubernetes-service) | (coming soon) -->


## Container image build
Expand All @@ -24,33 +25,29 @@ This tool can be packaged and run as a container image and a Containerfile is pr
In order to build a container locally:

```bash
make container
make image
```

By default, the container is built on top of the latest Fedora container image. If you have an **entitled Red Hat Enterprise Linux system**, you can use UBI9 (Universal Base Image) as the base:
The container image is built on top of the minimal variant of UBI 9 (Red Hat Universal Base Image, `ubi9/ubi-minimal:9.5`).

```bash
FROM=registry.access.redhat.com/ubi9:latest make container
```

Notes:
* currently, only UBI version 9 (based on Red Hat Enterprise Linux 9) is supported
* while the base image itself can be pulled without registration, the container image will not build without a valid Red Hat entitlement -- if you are running a registered RHEL system, the entitlement is automatically passed to the container at build time

Regardless of base image, the resulting container image repository (name) and tag can be customized by using `CONTAINER_REPO` and `CONTAINER_TAG` environment variables:
The resulting container image repository (name) and tag can be customized by using `CONTAINER_REPO` and `CONTAINER_TAG` environment variables:

```bash
CONTAINER_REPO=quay.io/myusername/llm-d-xks-preflight CONTAINER_TAG=mytag make container
FROM=registry.access.redhat.com/ubi9:latest CONTAINER_REPO=quay.io/myusername/llm-d-xks-preflight CONTAINER_TAG=mytag make container
CONTAINER_REPO=quay.io/myusername/llm-d-xks-preflight CONTAINER_TAG=mytag make image
Comment on lines +28 to +36
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Stale cloud_provider test entry in the Validations table (Line 67).

The cloud_provider row is still listed as a test under "Suite: cluster" in the Validations section (line 67), but this PR removes it from the self.tests["cluster"] dict in the Python script. Users reading this doc will expect a cloud_provider PASSED/FAILED result that will never appear. Consider removing or updating that row to clarify it's automatic detection during initialization, not a reported test.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@validation/README.md` around lines 28 - 36, Update the Validations table in
validation/README.md to remove or update the stale "cloud_provider" row under
"Suite: cluster" so it no longer promises a PASSED/FAILED test; reference that
this check is performed automatically during initialization in the code path
that modifies self.tests["cluster"] (i.e., remove the cloud_provider entry or
annotate it as "auto-detected during init, not a reported test") and ensure the
table text and any headings match the current behavior of self.tests["cluster"]
in the Python script.

```

## Container image run

After building the container image as described above, a helper `make` target is available to run the validations against a Kubernetes cluster:

```bash
# using defaults
# run all tests
make run

# run specific test suite (cluster or operators)
SUITE=cluster make run
SUITE=operators make run

# if the image name and tag have been customized
CONTAINER_REPO=quay.io/myusername/llm-d-xks-preflight CONTAINER_TAG=mytag make run
```
Expand Down Expand Up @@ -118,7 +115,7 @@ The application automatically looks for config files in the following locations

You can also specify a custom config file:
```bash
python llmd_xks_checks.py --config /path/to/config.conf
CONFIG=/path/to/config.conf make run
```

Example config file:
Expand All @@ -132,4 +129,5 @@ cloud_provider = azure

- `LLMD_XKS_LOG_LEVEL`: Log level (same choices as `--log-level`)
- `LLMD_XKS_CLOUD_PROVIDER`: Cloud provider (choices: auto, azure)
- `LLMD_XKS_SUITE`: Test suite to run (choices: all (default), cluster, operators)
- `KUBECONFIG`: Path to kubeconfig file (standard Kubernetes environment variable)
176 changes: 88 additions & 88 deletions validation/llmd_xks_checks.py → validation/llmd_xks_preflight.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,90 +40,90 @@ def __init__(self, **kwargs):
self.crds_cache = None

self.tests = {
"cluster": {
"description": "Cluster readiness tests",
"tests": [
{
"name": "instance_type",
"function": self.test_instance_type,
"description": "Test if the cluster has at least one supported instance type",
"suggested_action": "Provision a cluster with at least one supported instance type",
"result": False
},
{
"name": "gpu_availability",
"function": self.test_gpu_availability,
"description": "Test if the cluster has GPU drivers",
"suggested_action": "Provision a cluster with at least one supported GPU driver",
"result": False
},
]
"cluster": {
"description": "Cluster readiness tests",
"tests": [
{
"name": "instance_type",
"function": self.test_instance_type,
"description": "Test if the cluster has at least one supported instance type",
"suggested_action": "Provision a cluster with at least one supported instance type",
"result": False
},
"operators": {
"description": "Operators readiness tests",
"tests": [
{
"name": "crd_certmanager",
"function": self.test_crd_certmanager,
"description": "test if the cluster has the cert-manager crds",
"suggested_action": "install cert-manager",
"result": False
},
{
"name": "operator_certmanager",
"function": self.test_operator_certmanager,
"description": "test if the cert-manager operator is running properly",
"suggested_action": "install or verify cert-manager deployment",
"result": False
},
{
"name": "crd_sailoperator",
"function": self.test_crd_sailoperator,
"description": "test if the cluster has the sailoperator crds",
"suggested_action": "install sail-operator",
"result": False
},
{
"name": "operator_sail",
"function": self.test_operator_sail,
"description": "test if the sail operator is running properly",
"suggested_action": "install or verify sail operator deployment",
"result": False
},
{
"name": "crd_lwsoperator",
"function": self.test_crd_lwsoperator,
"description": "test if the cluster has the lws-operator crds",
"suggested_action": "install lws-operator",
"result": False,
"optional": True
},
{
"name": "operator_lws",
"function": self.test_operator_lws,
"description": "test if the lws-operator is running properly",
"suggested_action": "install or verify lws operator deployment",
"result": False,
"optional": True
},
{
"name": "crd_kserve",
"function": self.test_crd_kserve,
"description": "test if the cluster has the kserve crds",
"suggested_action": "install kserve",
"result": False,
"optional": False
},
{
"name": "operator_kserve",
"function": self.test_operator_kserve,
"description": "test if the kserve controller is running properly",
"suggested_action": "install or verify kserve deployment",
"result": False,
},
]
}
{
"name": "gpu_availability",
"function": self.test_gpu_availability,
"description": "Test if the cluster has GPU drivers",
"suggested_action": "Provision a cluster with at least one supported GPU driver",
"result": False
},
]
},
"operators": {
"description": "Operators readiness tests",
"tests": [
{
"name": "crd_certmanager",
"function": self.test_crd_certmanager,
"description": "test if the cluster has the cert-manager crds",
"suggested_action": "install cert-manager",
"result": False
},
{
"name": "operator_certmanager",
"function": self.test_operator_certmanager,
"description": "test if the cert-manager operator is running properly",
"suggested_action": "install or verify cert-manager deployment",
"result": False
},
{
"name": "crd_sailoperator",
"function": self.test_crd_sailoperator,
"description": "test if the cluster has the sailoperator crds",
"suggested_action": "install sail-operator",
"result": False
},
{
"name": "operator_sail",
"function": self.test_operator_sail,
"description": "test if the sail operator is running properly",
"suggested_action": "install or verify sail operator deployment",
"result": False
},
{
"name": "crd_lwsoperator",
"function": self.test_crd_lwsoperator,
"description": "test if the cluster has the lws-operator crds",
"suggested_action": "install lws-operator",
"result": False,
"optional": True
},
{
"name": "operator_lws",
"function": self.test_operator_lws,
"description": "test if the lws-operator is running properly",
"suggested_action": "install or verify lws operator deployment",
"result": False,
"optional": True
},
{
"name": "crd_kserve",
"function": self.test_crd_kserve,
"description": "test if the cluster has the kserve crds",
"suggested_action": "install kserve",
"result": False,
"optional": False
},
{
"name": "operator_kserve",
"function": self.test_operator_kserve,
"description": "test if the kserve controller is running properly",
"suggested_action": "install or verify kserve deployment",
"result": False,
},
]
}
}

def _log_init(self):
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -167,7 +167,7 @@ def _test_crds_present(self, required_crds):
def _deployment_ready(self, namespace_name, deployment_name):
try:
deployment = self.k8s_client.AppsV1Api().read_namespaced_deployment(
name=deployment_name, namespace=namespace_name)
name=deployment_name, namespace=namespace_name)
except Exception as e:
self.logger.error(f"{e}")
return False
Expand Down Expand Up @@ -278,13 +278,13 @@ def nvidia_driver_present(node):
return True
else:
self.logger.warning(
f"No allocatable NVIDIA GPUs on node {node.metadata.name}"
" - no NVIDIA GPU drivers present")
f"No allocatable NVIDIA GPUs on node {node.metadata.name}"
" - no NVIDIA GPU drivers present")
return False
else:
self.logger.warning(
f"No NVIDIA GPU drivers present on node {node.metadata.name}"
" - no NVIDIA GPU accelerators present")
f"No NVIDIA GPU drivers present on node {node.metadata.name}"
" - no NVIDIA GPU accelerators present")
return False
gpu_found = False
accelerators = {
Expand Down Expand Up @@ -463,7 +463,7 @@ def cli_arguments():
default="all",
env_var="LLMD_XKS_SUITE",
help="Test suite to execute"
)
)

return parser.parse_args()

Expand Down
4 changes: 2 additions & 2 deletions validation/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ dependencies = [
"Bug Tracker" = "https://github.com/kwozyman/llmd-xks-preflight/issues"

[tool.hatch.build.targets.wheel]
packages = ["llmd_xks_checks.py"]
packages = ["llmd_xks_preflight.py"]

[project.scripts]
llmd-xks-preflight = "llmd_xks_checks:main"
llmd-xks-preflight = "llmd_xks_preflight:main"
Loading