diff --git a/Makefile b/Makefile index 6cd9573d..8b64eac4 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ -# Include environment variables (skip for generate-env so it starts clean) -ifeq ($(filter generate-env,$(MAKECMDGOALS)),) +# Include environment variables (skip for targets that don't need a .env) +ifeq ($(filter generate-env validate-env-files help,$(MAKECMDGOALS)),) include .env export endif @@ -35,7 +35,7 @@ WORKER_SCRIPT := scripts/worker.sh delete-dpf-hcp-provisioner-operator \ verify-deployment verify-workers verify-dpu-nodes verify-dpudeployment \ run-traffic-flow-tests tft-setup tft-cleanup tft-show-config tft-results \ - generate-env + validate-env-files generate-env all: @mkdir -p logs @@ -268,8 +268,42 @@ verify-dpu-nodes: verify-dpudeployment: @$(VERIFY_SCRIPT) verify-dpudeployment +validate-env-files: + @bash -c '\ + set -e; \ + defaults=$$(grep -oP "^\w+" ci/env.defaults | sort); \ + template=$$(grep -oP "^\w+" ci/env.template | sort); \ + required=$$(grep -oP "\w+(?=:)" ci/env.required | sort); \ + known=$$( echo "$$defaults"; echo "$$required" ); \ + missing=""; \ + for var in $$defaults; do \ + if ! echo "$$template" | grep -qx "$$var"; then \ + missing="$$missing $$var"; \ + fi; \ + done; \ + extra=""; \ + for var in $$template; do \ + if ! 
echo "$$known" | grep -qx "$$var"; then \ + extra="$$extra $$var"; \ + fi; \ + done; \ + if [ -n "$$missing" ]; then \ + echo "ERROR: variables in ci/env.defaults that are missing from ci/env.template:"; \ + for var in $$missing; do echo " - $$var"; done; \ + echo ""; \ + echo "These variables will be silently dropped from .env."; \ + echo "Fix: add a line VAR_NAME=\$${VAR_NAME} to ci/env.template for each."; \ + exit 1; \ + fi; \ + if [ -n "$$extra" ]; then \ + count=$$(echo $$extra | wc -w | tr -d " "); \ + echo "OK $$count template-only variable(s) have no default (set per-environment):$${extra}"; \ + fi; \ + echo "OK all ci/env.defaults variables are present in ci/env.template"; \ + ' + FORCE ?= false -generate-env: +generate-env: validate-env-files @if [ -f .env ] && [ "$(FORCE)" != "true" ]; then \ echo "ERROR: .env already exists. To overwrite, run: make generate-env FORCE=true"; \ exit 1; \ @@ -441,5 +475,5 @@ help: @echo " TFT_DURATION - Duration per test in seconds (default: 10)" @echo " TFT_CONNECTION_TYPE - Test type: iperf-tcp, iperf-udp, etc. 
(default: iperf-tcp)" @echo " TFT_KUBECONFIG - Path to cluster kubeconfig" - @echo " TFT_SERVER_NODE - K8s node name for server (default: from HBN_HOSTNAME_NODE1)" - @echo " TFT_CLIENT_NODE - K8s node name for client (default: from HBN_HOSTNAME_NODE2)" + @echo " TFT_SERVER_NODE - K8s node name for server (default: from WORKER_1_NAME)" + @echo " TFT_CLIENT_NODE - K8s node name for client (default: from WORKER_2_NAME)" diff --git a/ci/env.defaults b/ci/env.defaults index 4d83eba4..afc51745 100644 --- a/ci/env.defaults +++ b/ci/env.defaults @@ -7,7 +7,8 @@ # Cluster Configuration OPENSHIFT_VERSION=${OPENSHIFT_VERSION:-4.20.4} -USE_V419_WORKAROUND=${USE_V419_WORKAROUND:-false} +CATALOG_SOURCE_NAME=${CATALOG_SOURCE_NAME:-redhat-operators} +CATALOG_SOURCE_IMAGE=${CATALOG_SOURCE_IMAGE:-} # Bridge Configuration BRIDGE_NAME=${BRIDGE_NAME:-mgmt-br} @@ -31,28 +32,58 @@ OVN_KUBERNETES_IMAGE_TAG=${OVN_KUBERNETES_IMAGE_TAG:-de9b16e0bb85c3b5727d5250124 OVN_KUBERNETES_UTILS_IMAGE_REPO=${OVN_KUBERNETES_UTILS_IMAGE_REPO:-ghcr.io/mellanox/ovn-kubernetes-dpf-utils} OVN_KUBERNETES_UTILS_IMAGE_TAG=${OVN_KUBERNETES_UTILS_IMAGE_TAG:-v25.7.1-cff70b1} INJECTOR_RESOURCE_NAME=${INJECTOR_RESOURCE_NAME:-openshift.io/bf3-p0-vfs} +INJECTOR_CHART_VERSION=${INJECTOR_CHART_VERSION:-v25.7.1-cff70b1} +OVNK_NAMESPACE=${OVNK_NAMESPACE:-openshift-ovn-kubernetes} +NUM_VFS=${NUM_VFS:-46} # GitOps Operator Configuration GITOPS_OPERATOR_CHANNEL=${GITOPS_OPERATOR_CHANNEL:-1.16} GITOPS_OPERATOR_VERSION=${GITOPS_OPERATOR_VERSION:-v1.16.4} +# Maintenance Operator Configuration +MAINTENANCE_OPERATOR_VERSION=${MAINTENANCE_OPERATOR_VERSION:-0.2.0} + # Hypershift Configuration HYPERSHIFT_IMAGE=${HYPERSHIFT_IMAGE:-quay.io/hypershift/hypershift-operator:latest} HOSTED_CLUSTER_NAME=${HOSTED_CLUSTER_NAME:-doca} CLUSTERS_NAMESPACE=${CLUSTERS_NAMESPACE:-clusters} -HOSTED_CONTROL_PLANE_NAMESPACE=${HOSTED_CONTROL_PLANE_NAMESPACE:-clusters-doca} 
OCP_RELEASE_IMAGE=${OCP_RELEASE_IMAGE:-quay.io/openshift-release-dev/ocp-release:4.20.4-multi} DISABLE_HCP_CAPS=${DISABLE_HCP_CAPS:-false} +ENABLE_HCP_MULTUS=${ENABLE_HCP_MULTUS:-true} DPF_CLUSTER_TYPE=${DPF_CLUSTER_TYPE:-hypershift} # Network Configuration POD_CIDR=${POD_CIDR:-10.128.0.0/14} SERVICE_CIDR=${SERVICE_CIDR:-172.30.0.0/16} +API_VIP=${API_VIP:-10.8.2.100} +INGRESS_VIP=${INGRESS_VIP:-10.8.2.101} +HBN_OVN_NETWORK=${HBN_OVN_NETWORK:-10.0.120.0/22} +NODES_MTU=${NODES_MTU:-1500} +PRIMARY_IFACE=${PRIMARY_IFACE:-enp1s0} + +# MetalLB Configuration +HYPERSHIFT_API_IP=${HYPERSHIFT_API_IP:-} # Pull Secrets OPENSHIFT_PULL_SECRET=${OPENSHIFT_PULL_SECRET:-openshift_pull.json} DPF_PULL_SECRET=${DPF_PULL_SECRET:-pull-secret.txt} +# NFD Configuration +NFD_OPERAND_IMAGE=${NFD_OPERAND_IMAGE:-quay.io/itsoiref/nfd:latest} + +# HBN DPU Services Configuration +HBN_HELM_REPO_URL=${HBN_HELM_REPO_URL:-https://helm.ngc.nvidia.com/nvidia/doca} +HBN_HELM_CHART_VERSION=${HBN_HELM_CHART_VERSION:-1.0.3} +HBN_IMAGE_REPO=${HBN_IMAGE_REPO:-nvcr.io/nvidia/doca/doca_hbn} +HBN_IMAGE_TAG=${HBN_IMAGE_TAG:-3.2.0-doca3.2.0} + +# DTS Service Configuration +DTS_HELM_REPO_URL=${DTS_HELM_REPO_URL:-https://helm.ngc.nvidia.com/nvidia/doca} +DTS_HELM_CHART_VERSION=${DTS_HELM_CHART_VERSION:-1.22.1} + +# DMS Hostagent Configuration +DMS_HOSTAGENT_IMAGE=${DMS_HOSTAGENT_IMAGE:-ghcr.io/killianmuldoon/hostdriver:v25.10.1-patch.1} + # VM Configuration VM_PREFIX=${VM_PREFIX:-vm-dpf} VM_COUNT=${VM_COUNT:-3} @@ -61,6 +92,7 @@ VCPUS=${VCPUS:-14} DISK_SIZE1=${DISK_SIZE1:-120} DISK_SIZE2=${DISK_SIZE2:-80} MAC_PREFIX=${MAC_PREFIX:-52:54:00:ee:42} +VM_STATIC_IP=${VM_STATIC_IP:-false} # Wait Configuration MAX_RETRIES=${MAX_RETRIES:-90} @@ -68,13 +100,19 @@ SLEEP_TIME=${SLEEP_TIME:-60} # Paths DISK_PATH=${DISK_PATH:-/var/lib/libvirt/images} -ISO_FOLDER=${ISO_FOLDER:-/var/lib/libvirt/images} +ISO_FOLDER=${ISO_FOLDER:-${DISK_PATH}} +ISO_TYPE=${ISO_TYPE:-minimal} 
STATIC_NET_FILE=${STATIC_NET_FILE:-./configuration_templates/static_net.yaml} # Storage STORAGE_TYPE=${STORAGE_TYPE:-lvm} +SKIP_DEPLOY_STORAGE=${SKIP_DEPLOY_STORAGE:-false} BFB_STORAGE_CLASS=${BFB_STORAGE_CLASS:-lvms-vg1} +BFB_URL=${BFB_URL:-http://10.8.2.236/bfb/rhcos_4.19.0-ec.4_installer_2025-04-23_07-48-42.bfb} +# NFS Configuration +NFS_SERVER_NODE_IP=${NFS_SERVER_NODE_IP:-} +NFS_PATH=${NFS_PATH:-/} # Kubeconfig KUBECONFIG=${KUBECONFIG:-./kubeconfig} @@ -83,6 +121,7 @@ KUBECONFIG=${KUBECONFIG:-./kubeconfig} AUTO_APPROVE_WORKER_CSR=${AUTO_APPROVE_WORKER_CSR:-false} AUTO_APPROVE_DPUCLUSTER_CSR=${AUTO_APPROVE_DPUCLUSTER_CSR:-false} WORKER_COUNT=${WORKER_COUNT:-0} +ENABLE_SHORT_WORKER_HOSTNAMES=${ENABLE_SHORT_WORKER_HOSTNAMES:-false} # Verification VERIFY_DEPLOYMENT=${VERIFY_DEPLOYMENT:-false} @@ -100,3 +139,13 @@ SANITY_TESTS_PODS_WORKLOAD_FILE=${SANITY_TESTS_PODS_WORKLOAD_FILE:-manifests/pos SANITY_TESTS_WORKLOAD_NAMESPACE=${SANITY_TESTS_WORKLOAD_NAMESPACE:-workload} SANITY_TESTS_PING_COUNT=${SANITY_TESTS_PING_COUNT:-20} SANITY_TESTS_PING_HBN_TO_HBN_PODS=${SANITY_TESTS_PING_HBN_TO_HBN_PODS:-false} + +# DPF HCP Provisioner Operator Configuration +DPF_HCP_PROVISIONER_OPERATOR_CHART_URL=${DPF_HCP_PROVISIONER_OPERATOR_CHART_URL:-oci://quay.io/lhadad/charts/dpf-hcp-provisioner-operator} +DPF_HCP_PROVISIONER_OPERATOR_NAMESPACE=${DPF_HCP_PROVISIONER_OPERATOR_NAMESPACE:-dpf-hcp-provisioner-system} +DPF_HCP_PROVISIONER_OPERATOR_VERSION=${DPF_HCP_PROVISIONER_OPERATOR_VERSION:-0.1.2} +DPF_HCP_PROVISIONER_OPERATOR_IMAGE_REPO=${DPF_HCP_PROVISIONER_OPERATOR_IMAGE_REPO:-quay.io/lhadad/dpf-hcp-provisioner-operator} +DPF_HCP_PROVISIONER_OPERATOR_IMAGE_TAG=${DPF_HCP_PROVISIONER_OPERATOR_IMAGE_TAG:-v0.1.2} +DPFHCPPROVISIONER_PULL_SECRET_NAME=${DPFHCPPROVISIONER_PULL_SECRET_NAME:-my-pull-secret} +DPFHCPPROVISIONER_SSH_SECRET_NAME=${DPFHCPPROVISIONER_SSH_SECRET_NAME:-my-ssh-key} +ENABLE_BLUEFIELD_VALIDATION=${ENABLE_BLUEFIELD_VALIDATION:-false} diff --git a/ci/env.required 
b/ci/env.required index 83c7ca75..dd4da334 100644 --- a/ci/env.required +++ b/ci/env.required @@ -14,7 +14,6 @@ # Network / DPU : ${API_VIP:?API_VIP must be set} : ${INGRESS_VIP:?INGRESS_VIP must be set} -: ${DPU_INTERFACE:?DPU_INTERFACE must be set (e.g. ens5f0np0)} : ${DPU_HOST_CIDR:?DPU_HOST_CIDR must be set (e.g. 10.0.110.0/24)} : ${HBN_OVN_NETWORK:?HBN_OVN_NETWORK must be set (e.g. 10.0.120.0/22)} @@ -25,8 +24,6 @@ : ${BFB_URL:?BFB_URL must be set} # HBN DPUServices -: ${HBN_HOSTNAME_NODE1:?HBN_HOSTNAME_NODE1 must be set} -: ${HBN_HOSTNAME_NODE2:?HBN_HOSTNAME_NODE2 must be set} : ${HBN_HELM_REPO_URL:?HBN_HELM_REPO_URL must be set} : ${HBN_HELM_CHART_VERSION:?HBN_HELM_CHART_VERSION must be set} : ${HBN_IMAGE_REPO:?HBN_IMAGE_REPO must be set} diff --git a/ci/env.template b/ci/env.template index a9b6c53b..5ae26198 100644 --- a/ci/env.template +++ b/ci/env.template @@ -5,7 +5,8 @@ CLUSTER_NAME=${CLUSTER_NAME} BASE_DOMAIN=${BASE_DOMAIN} OPENSHIFT_VERSION=${OPENSHIFT_VERSION} -USE_V419_WORKAROUND=${USE_V419_WORKAROUND} +CATALOG_SOURCE_NAME=${CATALOG_SOURCE_NAME} +CATALOG_SOURCE_IMAGE=${CATALOG_SOURCE_IMAGE} # Bridge Configuration BRIDGE_NAME=${BRIDGE_NAME} @@ -28,41 +29,60 @@ OVN_KUBERNETES_IMAGE_REPO=${OVN_KUBERNETES_IMAGE_REPO} OVN_KUBERNETES_IMAGE_TAG=${OVN_KUBERNETES_IMAGE_TAG} OVN_KUBERNETES_UTILS_IMAGE_REPO=${OVN_KUBERNETES_UTILS_IMAGE_REPO} OVN_KUBERNETES_UTILS_IMAGE_TAG=${OVN_KUBERNETES_UTILS_IMAGE_TAG} +INJECTOR_RESOURCE_NAME=${INJECTOR_RESOURCE_NAME} +INJECTOR_CHART_VERSION=${INJECTOR_CHART_VERSION} +OVNK_NAMESPACE=${OVNK_NAMESPACE} +NUM_VFS=${NUM_VFS} # GitOps Operator Configuration GITOPS_OPERATOR_CHANNEL=${GITOPS_OPERATOR_CHANNEL} GITOPS_OPERATOR_VERSION=${GITOPS_OPERATOR_VERSION} +# Maintenance Operator Configuration +MAINTENANCE_OPERATOR_VERSION=${MAINTENANCE_OPERATOR_VERSION} + # Hypershift Configuration HYPERSHIFT_IMAGE=${HYPERSHIFT_IMAGE} HOSTED_CLUSTER_NAME=${HOSTED_CLUSTER_NAME} CLUSTERS_NAMESPACE=${CLUSTERS_NAMESPACE} 
-HOSTED_CONTROL_PLANE_NAMESPACE=${HOSTED_CONTROL_PLANE_NAMESPACE} OCP_RELEASE_IMAGE=${OCP_RELEASE_IMAGE} DISABLE_HCP_CAPS=${DISABLE_HCP_CAPS} +ENABLE_HCP_MULTUS=${ENABLE_HCP_MULTUS} DPF_CLUSTER_TYPE=${DPF_CLUSTER_TYPE} # Network Configuration POD_CIDR=${POD_CIDR} SERVICE_CIDR=${SERVICE_CIDR} DPU_HOST_CIDR=${DPU_HOST_CIDR} -HBN_OVN_NETWORK=${HBN_OVN_NETWORK} API_VIP=${API_VIP} INGRESS_VIP=${INGRESS_VIP} -INJECTOR_RESOURCE_NAME=${INJECTOR_RESOURCE_NAME} +HBN_OVN_NETWORK=${HBN_OVN_NETWORK} +NODES_MTU=${NODES_MTU} +PRIMARY_IFACE=${PRIMARY_IFACE} + +# MetalLB Configuration +HYPERSHIFT_API_IP=${HYPERSHIFT_API_IP} # Pull Secret files OPENSHIFT_PULL_SECRET=${OPENSHIFT_PULL_SECRET} DPF_PULL_SECRET=${DPF_PULL_SECRET} +# NFD Configuration +NFD_OPERAND_IMAGE=${NFD_OPERAND_IMAGE} + # DPU Services Configuration -HBN_HOSTNAME_NODE1=${HBN_HOSTNAME_NODE1} -HBN_HOSTNAME_NODE2=${HBN_HOSTNAME_NODE2} HBN_HELM_REPO_URL=${HBN_HELM_REPO_URL} HBN_HELM_CHART_VERSION=${HBN_HELM_CHART_VERSION} HBN_IMAGE_REPO=${HBN_IMAGE_REPO} HBN_IMAGE_TAG=${HBN_IMAGE_TAG} +# DTS Service Configuration +DTS_HELM_REPO_URL=${DTS_HELM_REPO_URL} +DTS_HELM_CHART_VERSION=${DTS_HELM_CHART_VERSION} + +# DMS Hostagent Configuration +DMS_HOSTAGENT_IMAGE=${DMS_HOSTAGENT_IMAGE} + # VM Configuration VM_PREFIX=${VM_PREFIX} VM_COUNT=${VM_COUNT} @@ -71,6 +91,7 @@ VCPUS=${VCPUS} DISK_SIZE1=${DISK_SIZE1} DISK_SIZE2=${DISK_SIZE2} MAC_PREFIX=${MAC_PREFIX} +VM_STATIC_IP=${VM_STATIC_IP} # Wait Configuration MAX_RETRIES=${MAX_RETRIES} @@ -79,19 +100,26 @@ SLEEP_TIME=${SLEEP_TIME} # Paths DISK_PATH=${DISK_PATH} ISO_FOLDER=${ISO_FOLDER} +ISO_TYPE=${ISO_TYPE} STATIC_NET_FILE=${STATIC_NET_FILE} # Storage STORAGE_TYPE=${STORAGE_TYPE} +SKIP_DEPLOY_STORAGE=${SKIP_DEPLOY_STORAGE} BFB_STORAGE_CLASS=${BFB_STORAGE_CLASS} BFB_URL=${BFB_URL} +# NFS Configuration +NFS_SERVER_NODE_IP=${NFS_SERVER_NODE_IP} +NFS_PATH=${NFS_PATH} + # Kubeconfig KUBECONFIG=${KUBECONFIG} TARGETCLUSTER_API_SERVER_HOST=${TARGETCLUSTER_API_SERVER_HOST} # Worker Node 
Provisioning WORKER_COUNT=${WORKER_COUNT} +ENABLE_SHORT_WORKER_HOSTNAMES=${ENABLE_SHORT_WORKER_HOSTNAMES} # Worker 1 WORKER_1_NAME=${WORKER_1_NAME} @@ -131,3 +159,13 @@ SANITY_TESTS_PODS_WORKLOAD_FILE=${SANITY_TESTS_PODS_WORKLOAD_FILE} SANITY_TESTS_WORKLOAD_NAMESPACE=${SANITY_TESTS_WORKLOAD_NAMESPACE} SANITY_TESTS_PING_COUNT=${SANITY_TESTS_PING_COUNT} SANITY_TESTS_PING_HBN_TO_HBN_PODS=${SANITY_TESTS_PING_HBN_TO_HBN_PODS} + +# DPF HCP Provisioner Operator Configuration +DPF_HCP_PROVISIONER_OPERATOR_CHART_URL=${DPF_HCP_PROVISIONER_OPERATOR_CHART_URL} +DPF_HCP_PROVISIONER_OPERATOR_NAMESPACE=${DPF_HCP_PROVISIONER_OPERATOR_NAMESPACE} +DPF_HCP_PROVISIONER_OPERATOR_VERSION=${DPF_HCP_PROVISIONER_OPERATOR_VERSION} +DPF_HCP_PROVISIONER_OPERATOR_IMAGE_REPO=${DPF_HCP_PROVISIONER_OPERATOR_IMAGE_REPO} +DPF_HCP_PROVISIONER_OPERATOR_IMAGE_TAG=${DPF_HCP_PROVISIONER_OPERATOR_IMAGE_TAG} +DPFHCPPROVISIONER_PULL_SECRET_NAME=${DPFHCPPROVISIONER_PULL_SECRET_NAME} +DPFHCPPROVISIONER_SSH_SECRET_NAME=${DPFHCPPROVISIONER_SSH_SECRET_NAME} +ENABLE_BLUEFIELD_VALIDATION=${ENABLE_BLUEFIELD_VALIDATION} diff --git a/docs/user-guide/configuration.md b/docs/user-guide/configuration.md index 7e7a23c4..ed14e8b2 100644 --- a/docs/user-guide/configuration.md +++ b/docs/user-guide/configuration.md @@ -73,7 +73,6 @@ OVN_CHART_VERSION=v25.7.1-f073927 # Matches DPF version ```bash # DPU Interface Settings -DPU_INTERFACE=ens7f0np0 # Physical DPU interface NUM_VFS=46 # Number of SR-IOV VFs DPU_HOST_CIDR=10.6.130.0/24 # DPU host network HBN_OVN_NETWORK=10.6.150.0/27 # HBN network range diff --git a/docs/user-guide/deployment-scenarios.md b/docs/user-guide/deployment-scenarios.md index 9671f4c9..a7bb7b81 100644 --- a/docs/user-guide/deployment-scenarios.md +++ b/docs/user-guide/deployment-scenarios.md @@ -192,7 +192,6 @@ WORKER_2_BOOT_MAC=aa:bb:cc:dd:ee:02 WORKER_2_ROOT_DEVICE=/dev/sda # DPU Configuration -DPU_INTERFACE=ens7f0np0 # Physical DPU interface NUM_VFS=46 # Number of virtual functions ``` diff --git 
a/docs/user-guide/troubleshooting.md b/docs/user-guide/troubleshooting.md index 61784561..cbb6493e 100644 --- a/docs/user-guide/troubleshooting.md +++ b/docs/user-guide/troubleshooting.md @@ -157,9 +157,8 @@ ip link show | grep ens7f0 # Common fixes: # 1. Wait for SR-IOV operator to configure interfaces (10+ minutes) -# 2. Verify DPU_INTERFACE setting in .env -# 3. Check DPU hardware is properly installed -# 4. Verify NUM_VFS configuration +# 2. Check DPU hardware is properly installed +# 3. Verify NUM_VFS configuration ``` ## Storage Issues diff --git a/manifests/cluster-installation/4.19-cataloguesource.yaml b/manifests/cluster-installation/custom-catalogsource.yaml similarity index 63% rename from manifests/cluster-installation/4.19-cataloguesource.yaml rename to manifests/cluster-installation/custom-catalogsource.yaml index 80a86794..3b45c54e 100644 --- a/manifests/cluster-installation/4.19-cataloguesource.yaml +++ b/manifests/cluster-installation/custom-catalogsource.yaml @@ -1,11 +1,11 @@ apiVersion: operators.coreos.com/v1alpha1 kind: CatalogSource metadata: - name: redhat-operators-v419 + name: <CATALOG_SOURCE_NAME> namespace: openshift-marketplace spec: - displayName: Red Hat Operators v4.19 - image: registry.redhat.io/redhat/redhat-operator-index:v4.19 + displayName: <CATALOG_SOURCE_NAME> + image: <CATALOG_SOURCE_IMAGE> priority: -100 publisher: Red Hat sourceType: grpc diff --git a/scripts/env.sh b/scripts/env.sh index bf73b130..d4e00786 100755 --- a/scripts/env.sh +++ b/scripts/env.sh @@ -51,199 +51,25 @@ if [ -z "${MAKELEVEL:-}" ]; then validate_mtu fi -# Directory Configuration -MANIFESTS_DIR=${MANIFESTS_DIR:-"manifests"} -GENERATED_DIR=${GENERATED_DIR:-"$MANIFESTS_DIR/generated"} -POST_INSTALL_DIR="${MANIFESTS_DIR}/post-installation" -GENERATED_POST_INSTALL_DIR="${GENERATED_DIR}/post-install" +# Computed / conditional variables — derived from .env values at runtime.
HELM_CHARTS_DIR=${HELM_CHARTS_DIR:-"$MANIFESTS_DIR/helm-charts-values"} - -# BFB Configuration -BFB_URL=${BFB_URL:-"http://10.8.2.236/bfb/rhcos_4.19.0-ec.4_installer_2025-04-23_07-48-42.bfb"} - -# HBN OVN Configuration -HBN_OVN_NETWORK=${HBN_OVN_NETWORK:-"10.0.120.0/22"} - -# HBN Service Template Configuration -HBN_HELM_REPO_URL=${HBN_HELM_REPO_URL:-"https://helm.ngc.nvidia.com/nvidia/doca"} -HBN_HELM_CHART_VERSION=${HBN_HELM_CHART_VERSION:-"1.0.3"} -HBN_IMAGE_REPO=${HBN_IMAGE_REPO:-"nvcr.io/nvidia/doca/doca_hbn"} -HBN_IMAGE_TAG=${HBN_IMAGE_TAG:-"3.2.0-doca3.2.0"} - -# DTS Service Template Configuration -DTS_HELM_REPO_URL=${DTS_HELM_REPO_URL:-"https://helm.ngc.nvidia.com/nvidia/doca"} -DTS_HELM_CHART_VERSION=${DTS_HELM_CHART_VERSION:-"1.22.1"} - -# Cluster Configuration -CLUSTER_NAME=${CLUSTER_NAME:-"doca"} -BASE_DOMAIN=${BASE_DOMAIN:-"lab.nvidia.com"} -OPENSHIFT_VERSION=${OPENSHIFT_VERSION:-"4.14.0"} -KUBECONFIG=${KUBECONFIG:-"./${CLUSTER_NAME}-kubeconfig"} -SSH_KEY=${SSH_KEY:-"$HOME/.ssh/id_rsa.pub"} - -# Network Configuration -POD_CIDR=${POD_CIDR:-"10.128.0.0/14"} -SERVICE_CIDR=${SERVICE_CIDR:-"172.30.0.0/16"} -API_VIP=${API_VIP:-"10.8.2.100"} -INGRESS_VIP=${INGRESS_VIP:-"10.8.2.101"} - -# VM Configuration -VM_COUNT=${VM_COUNT:-"3"} -RAM=${RAM:-"41984"} -VCPUS=${VCPUS:-"14"} -DISK_SIZE1=${DISK_SIZE1:-"120"} -DISK_SIZE2=${DISK_SIZE2:-"80"} -VM_PREFIX=${VM_PREFIX:-"vm-dpf"} -VM_STATIC_IP=${VM_STATIC_IP:-"false"} - -# MAC Address Configuration -MAC_PREFIX=${MAC_PREFIX:-""} # If set, use custom-prefix method, otherwise use machine-id - -# Paths -DISK_PATH=${DISK_PATH:-"/var/lib/libvirt/images"} -ISO_FOLDER=${ISO_FOLDER:-${DISK_PATH}} -ISO_TYPE=${ISO_TYPE:-"minimal"} - -BRIDGE_NAME=${BRIDGE_NAME:-br0} -SKIP_BRIDGE_CONFIG=${SKIP_BRIDGE_CONFIG:-"false"} - -# DPF Configuration -DPF_VERSION=${DPF_VERSION:-"v25.7.1"} - -# DMS Hostagent Image Override (for DNS policy workaround) -# This image is used to patch hostagent pods with the dnsPolicy fix 
-DMS_HOSTAGENT_IMAGE=${DMS_HOSTAGENT_IMAGE:-"ghcr.io/killianmuldoon/hostdriver:v25.10.1-patch.1"} - -# Helm Chart URLs - OCI registry format for v25.7+ -DPF_HELM_REPO_URL=${DPF_HELM_REPO_URL:-"https://helm.ngc.nvidia.com/nvidia/doca"} -OVN_CHART_URL=${OVN_CHART_URL:-"oci://ghcr.io/mellanox/charts"} -OVN_TEMPLATE_CHART_URL=${OVN_TEMPLATE_CHART_URL:-${OVN_CHART_URL}} - -# OVN Image Configuration -OVN_KUBERNETES_IMAGE_REPO=${OVN_KUBERNETES_IMAGE_REPO:-"quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256"} -OVN_KUBERNETES_IMAGE_TAG=${OVN_KUBERNETES_IMAGE_TAG:-"780d11fac73412276b312b3f7c879b5e63da9687c7c8e79fc142e9c6e2f7c4cf"} - -# OVN-Kubernetes DPF Utils Image Configuration -# These are optional - if not set in .env, the imagedpf section will be omitted from ovn-template.yaml -OVN_KUBERNETES_UTILS_IMAGE_REPO=${OVN_KUBERNETES_UTILS_IMAGE_REPO:-""} -OVN_KUBERNETES_UTILS_IMAGE_TAG=${OVN_KUBERNETES_UTILS_IMAGE_TAG:-""} - -OVN_CHART_VERSION=${OVN_CHART_VERSION:-${DPF_VERSION}} -INJECTOR_CHART_VERSION=${INJECTOR_CHART_VERSION:-${OVN_CHART_VERSION}} - -# OVN-Kubernetes Namespace -OVNK_NAMESPACE=${OVNK_NAMESPACE:-"openshift-ovn-kubernetes"} - -NFD_OPERAND_IMAGE=${NFD_OPERAND_IMAGE:-"quay.io/itsoiref/nfd:latest"} - HOST_CLUSTER_API=${HOST_CLUSTER_API:-"api.$CLUSTER_NAME.$BASE_DOMAIN"} +HOSTED_CONTROL_PLANE_NAMESPACE=${HOSTED_CONTROL_PLANE_NAMESPACE:-"${CLUSTERS_NAMESPACE}-${HOSTED_CLUSTER_NAME}"} -# NFS Configuration -# NFS_SERVER_NODE_IP: IP address of external NFS server -# - For VM_COUNT < 3: Uses internal NFS (HOST_CLUSTER_API), this variable is ignored -# - For VM_COUNT >= 3 with BFB_STORAGE_CLASS=nfs-client: MUST be set to external NFS server IP -# NFS_PATH: Path exported by NFS server. Defaults to "/" -NFS_SERVER_NODE_IP=${NFS_SERVER_NODE_IP:-""} -NFS_PATH=${NFS_PATH:-"/"} - -# Storage Configuration -# SKIP_DEPLOY_STORAGE: When true, do not deploy LSO/LVM/ODF; use existing StorageClasses. 
-# You must set ETCD_STORAGE_CLASS to a StorageClass that already exists in the cluster. -# Validation runs after cluster install to ensure the StorageClass exists. -SKIP_DEPLOY_STORAGE=${SKIP_DEPLOY_STORAGE:-"false"} - -# STORAGE_TYPE: Choose storage backend for Hypershift etcd (ignored when SKIP_DEPLOY_STORAGE=true) -# - lvm: Logical Volume Manager Storage (default, works for SNO and MNO) -# - odf: OpenShift Data Foundation (multi-node only, requires 3+ nodes) -STORAGE_TYPE=${STORAGE_TYPE:-"lvm"} - -# Validate ODF requires at least 3 nodes +# Storage class — conditional on STORAGE_TYPE and SKIP_DEPLOY_STORAGE if [ "${STORAGE_TYPE}" == "odf" ] && [ "${VM_COUNT}" -lt 3 ]; then echo "Warning: ODF requires at least 3 nodes. Falling back to LVM." >&2 STORAGE_TYPE="lvm" fi -# Set storage class based on STORAGE_TYPE (when not skipping storage deploy). -# When SKIP_DEPLOY_STORAGE=true: ETCD_STORAGE_CLASS is user-defined only (no default). -# Existence of the StorageClass in the cluster is validated after install (see cluster.sh validate_storage_classes_available). if [ "${SKIP_DEPLOY_STORAGE}" = "true" ]; then if [ -z "${ETCD_STORAGE_CLASS}" ]; then echo "Error: SKIP_DEPLOY_STORAGE=true requires ETCD_STORAGE_CLASS to be set in .env to your existing StorageClass name." >&2 echo "Create the StorageClass in the cluster (e.g. via your storage operator), then set ETCD_STORAGE_CLASS in .env." >&2 exit 1 fi - # Do not assign a default; user must define ETCD_STORAGE_CLASS in .env. 
elif [ "${STORAGE_TYPE}" == "odf" ]; then ETCD_STORAGE_CLASS=${ETCD_STORAGE_CLASS:-"ocs-storagecluster-ceph-rbd"} else ETCD_STORAGE_CLASS=${ETCD_STORAGE_CLASS:-"lvms-vg1"} fi -NUM_VFS=${NUM_VFS:-"46"} - -# Feature Configuration - -# GitOps Operator Configuration -GITOPS_OPERATOR_CHANNEL=${GITOPS_OPERATOR_CHANNEL:-"1.16"} -GITOPS_OPERATOR_VERSION=${GITOPS_OPERATOR_VERSION:-"v1.16.3"} - -# Maintenance Operator Configuration -MAINTENANCE_OPERATOR_VERSION=${MAINTENANCE_OPERATOR_VERSION:-"0.2.0"} - -# Hypershift Configuration -ENABLE_HCP_MULTUS=${ENABLE_HCP_MULTUS:-"true"} -HYPERSHIFT_IMAGE=${HYPERSHIFT_IMAGE:-"quay.io/hypershift/hypershift-operator:latest"} -HOSTED_CLUSTER_NAME=${HOSTED_CLUSTER_NAME:-"doca"} -CLUSTERS_NAMESPACE=${CLUSTERS_NAMESPACE:-"clusters"} -OCP_RELEASE_IMAGE=${OCP_RELEASE_IMAGE:-"quay.io/openshift-release-dev/ocp-release:4.20.4-x86_64"} -HOSTED_CONTROL_PLANE_NAMESPACE="${CLUSTERS_NAMESPACE}-${HOSTED_CLUSTER_NAME}" - - -# Wait Configuration -MAX_RETRIES=${MAX_RETRIES:-"90"} -SLEEP_TIME=${SLEEP_TIME:-"60"} - -# Worker Provisioning Configuration -# Enable short worker hostnames (sets hostname based on MAC address via MachineConfig) -ENABLE_SHORT_WORKER_HOSTNAMES=${ENABLE_SHORT_WORKER_HOSTNAMES:-"false"} - -# CSR Auto-Approval Configuration -# AUTO_APPROVE_WORKER_CSR: Deploy CronJob to auto-approve CSRs for host cluster workers -AUTO_APPROVE_WORKER_CSR=${AUTO_APPROVE_WORKER_CSR:-"false"} -# AUTO_APPROVE_DPUCLUSTER_CSR: [DEPRECATED] Deploy CronJob to auto-approve CSRs for DPUCluster nodes -# NOTE: DPF HCP Provisioner Operator (v0.1.2+) handles CSR approval automatically. -# This standalone CronJob approach is deprecated and only needed for legacy deployments. 
-AUTO_APPROVE_DPUCLUSTER_CSR=${AUTO_APPROVE_DPUCLUSTER_CSR:-"false"} -STATIC_NET_FILE=${STATIC_NET_FILE:-"./configuration_templates/static_net.yaml"} -NODES_MTU=${NODES_MTU:-"1500"} -PRIMARY_IFACE=${PRIMARY_IFACE:-enp1s0} - -# OLM Catalog Source Configuration -CATALOG_SOURCE_NAME=${CATALOG_SOURCE_NAME:-"redhat-operators"} - -USE_V419_WORKAROUND=${USE_V419_WORKAROUND:-"false"} - -if [[ "${USE_V419_WORKAROUND}" == "true" ]]; then - CATALOG_SOURCE_NAME="redhat-operators-v419" -else - CATALOG_SOURCE_NAME="redhat-operators" -fi - -# MetalLB Configuration (for multi-node clusters) -# HYPERSHIFT_API_IP: IP address for Hypershift API server LoadBalancer (required for multi-node with Hypershift) -HYPERSHIFT_API_IP=${HYPERSHIFT_API_IP:-""} - -# Default values For DPF sanity tests script -SANITY_TESTS_PODS_WORKLOAD_FILE=${SANITY_TESTS_PODS_WORKLOAD_FILE:-"manifests/post-installation-manual/workload.yaml"} -SANITY_TESTS_WORKLOAD_NAMESPACE=${SANITY_TESTS_WORKLOAD_NAMESPACE:-"workload"} -SANITY_TESTS_PING_COUNT=${SANITY_TESTS_PING_COUNT:-"20"} -SANITY_TESTS_PING_HBN_TO_HBN_PODS=${SANITY_TESTS_PING_HBN_TO_HBN_PODS:-"false"} - -# DPF HCP Provisioner Operator Configuration -DPF_HCP_PROVISIONER_OPERATOR_CHART_URL=${DPF_HCP_PROVISIONER_OPERATOR_CHART_URL:-"oci://quay.io/lhadad/charts/dpf-hcp-provisioner-operator"} -DPF_HCP_PROVISIONER_OPERATOR_NAMESPACE=${DPF_HCP_PROVISIONER_OPERATOR_NAMESPACE:-"dpf-hcp-provisioner-system"} -DPF_HCP_PROVISIONER_OPERATOR_VERSION=${DPF_HCP_PROVISIONER_OPERATOR_VERSION:-"0.1.2"} -DPF_HCP_PROVISIONER_OPERATOR_IMAGE_REPO=${DPF_HCP_PROVISIONER_OPERATOR_IMAGE_REPO:-"quay.io/lhadad/dpf-hcp-provisioner-operator"} -DPF_HCP_PROVISIONER_OPERATOR_IMAGE_TAG=${DPF_HCP_PROVISIONER_OPERATOR_IMAGE_TAG:-"v0.1.2"} -DPFHCPPROVISIONER_PULL_SECRET_NAME=${DPFHCPPROVISIONER_PULL_SECRET_NAME:-"my-pull-secret"} -DPFHCPPROVISIONER_SSH_SECRET_NAME=${DPFHCPPROVISIONER_SSH_SECRET_NAME:-"my-ssh-key"} -ENABLE_BLUEFIELD_VALIDATION=${ENABLE_BLUEFIELD_VALIDATION:-"false"} diff --git 
a/scripts/manifests.sh b/scripts/manifests.sh index 1a61c919..f503f989 100755 --- a/scripts/manifests.sh +++ b/scripts/manifests.sh @@ -134,9 +134,7 @@ function prepare_cluster_manifests() { "99-worker-bridge.yaml" ) - if [ "${USE_V419_WORKAROUND}" != "true" ]; then - excluded_files+=("4.19-cataloguesource.yaml") - fi + excluded_files+=("custom-catalogsource.yaml") # Copy all manifests except excluded files using utility function copy_manifests_with_exclusions "$MANIFESTS_DIR/cluster-installation" "$GENERATED_DIR" "${excluded_files[@]}" @@ -206,7 +204,6 @@ update_worker_manifest() { function deploy_core_operator_sources() { log [INFO] "Deploying NFD and SR-IOV subscriptions..." log [INFO] "Using catalog source: ${CATALOG_SOURCE_NAME}" - log [INFO] "Using v4.19 workaround: ${USE_V419_WORKAROUND}" mkdir -p "$GENERATED_DIR" @@ -216,14 +213,14 @@ function deploy_core_operator_sources() { "<CATALOG_SOURCE_NAME>" "$CATALOG_SOURCE_NAME" apply_manifest "$GENERATED_DIR/nfd-subscription.yaml" true - if [[ "${USE_V419_WORKAROUND}" == "true" ]]; then - log [INFO] "Deploying v4.19 catalog source (workaround enabled)" - local catalog_file="$MANIFESTS_DIR/cluster-installation/4.19-cataloguesource.yaml" - if [ -f "$catalog_file" ]; then - apply_manifest "$catalog_file" true - fi - else - log [INFO] "Skipping v4.19 catalog source deployment (using standard OLM)" + if [[ -n "${CATALOG_SOURCE_IMAGE}" ]]; then + log [INFO] "Deploying custom catalog source: ${CATALOG_SOURCE_NAME} (image: ${CATALOG_SOURCE_IMAGE})" + update_file_multi_replace \ + "$MANIFESTS_DIR/cluster-installation/custom-catalogsource.yaml" \ + "$GENERATED_DIR/custom-catalogsource.yaml" \ + "<CATALOG_SOURCE_NAME>" "$CATALOG_SOURCE_NAME" \ + "<CATALOG_SOURCE_IMAGE>" "$CATALOG_SOURCE_IMAGE" + apply_manifest "$GENERATED_DIR/custom-catalogsource.yaml" true fi log [INFO] "Core operator sources deployed."
diff --git a/scripts/traffic-flow-tests.sh b/scripts/traffic-flow-tests.sh index e6e7dbb8..281e6b09 100755 --- a/scripts/traffic-flow-tests.sh +++ b/scripts/traffic-flow-tests.sh @@ -45,11 +45,8 @@ TFT_KUBECONFIG="${TFT_KUBECONFIG:-$(pwd)/kubeconfig.${CLUSTER_NAME}}" # Node names for TFT (server and client) # These are the actual Kubernetes node names, NOT BareMetalHost names -# Priority: TFT_*_NODE > HBN_HOSTNAME_NODE* (minus wildcard) > WORKER_*_NAME -_hbn_node1="${HBN_HOSTNAME_NODE1%\*}" -_hbn_node2="${HBN_HOSTNAME_NODE2%\*}" -TFT_SERVER_NODE="${TFT_SERVER_NODE:-${_hbn_node1:-${WORKER_1_NAME}}}" -TFT_CLIENT_NODE="${TFT_CLIENT_NODE:-${_hbn_node2:-${WORKER_2_NAME}}}" +TFT_SERVER_NODE="${TFT_SERVER_NODE:-${WORKER_1_NAME}}" +TFT_CLIENT_NODE="${TFT_CLIENT_NODE:-${WORKER_2_NAME}}" # ----------------------------------------------------------------------------- # Ensure Python 3.11 is available (install if missing) @@ -182,7 +179,7 @@ generate_config() { # Validate required node names if [[ -z "${TFT_SERVER_NODE}" ]] || [[ -z "${TFT_CLIENT_NODE}" ]]; then log "ERROR" "TFT_SERVER_NODE and TFT_CLIENT_NODE must be set" - log "ERROR" "These are derived from HBN_HOSTNAME_NODE1/NODE2 or can be set directly" + log "ERROR" "Set TFT_SERVER_NODE/TFT_CLIENT_NODE or WORKER_1_NAME/WORKER_2_NAME" log "ERROR" "They should match actual Kubernetes node names (not BareMetalHost names)" return 1 fi @@ -399,8 +396,7 @@ show_config() { echo "" echo "Node name sources (priority order):" echo " 1. TFT_SERVER_NODE / TFT_CLIENT_NODE (if set)" - echo " 2. HBN_HOSTNAME_NODE1/2 (minus wildcard): ${HBN_HOSTNAME_NODE1:-} / ${HBN_HOSTNAME_NODE2:-}" - echo " 3. WORKER_1_NAME / WORKER_2_NAME: ${WORKER_1_NAME:-} / ${WORKER_2_NAME:-}" + echo " 2. 
WORKER_1_NAME / WORKER_2_NAME: ${WORKER_1_NAME:-} / ${WORKER_2_NAME:-}" echo "" echo "Excluded Test Cases (known failures):" echo " 4 - POD_TO_HOST_DIFF_NODE" @@ -453,14 +449,14 @@ case "${1:-}" in echo " TFT_DURATION - Duration per test in seconds (default: 10)" echo " TFT_CONNECTION_TYPE - Connection type: iperf-tcp, iperf-udp, etc. (default: iperf-tcp)" echo " TFT_KUBECONFIG - Path to cluster kubeconfig" - echo " TFT_SERVER_NODE - Kubernetes node name for server (default: from HBN_HOSTNAME_NODE1)" - echo " TFT_CLIENT_NODE - Kubernetes node name for client (default: from HBN_HOSTNAME_NODE2)" + echo " TFT_SERVER_NODE - Kubernetes node name for server (default: from WORKER_1_NAME)" + echo " TFT_CLIENT_NODE - Kubernetes node name for client (default: from WORKER_2_NAME)" echo " TFT_PYTHON - Python interpreter (default: python3.11)" echo "" echo "Note: Python 3.11 is required. If not installed, the script will attempt" echo " to install it automatically using dnf/yum/apt." echo "" - echo "Node names fallback: TFT_*_NODE > HBN_HOSTNAME_NODE* > WORKER_*_NAME" + echo "Node names fallback: TFT_*_NODE > WORKER_*_NAME" exit 1 ;; esac