diff --git a/Golang_RHEL_Dockerfile b/Golang_RHEL_Dockerfile deleted file mode 100644 index 015126a..0000000 --- a/Golang_RHEL_Dockerfile +++ /dev/null @@ -1,167 +0,0 @@ -# Common (multistage) args -ARG D_OS="rhel9.2" -ARG D_ARCH="x86_64" -ARG D_CONTAINER_VER="0" -ARG D_DOCA_VERSION="2.9.1" -ARG D_OFED_VERSION="24.10-1.1.4.0" -ARG D_KERNEL_VER="5.14.0-284.32.1.el9_2.x86_64" -ARG D_OFED_SRC_DOWNLOAD_PATH="/run/mellanox/src" -ARG OFED_SRC_LOCAL_DIR=${D_OFED_SRC_DOWNLOAD_PATH}/MLNX_OFED_SRC-${D_OFED_VERSION} - -# DTK base image (below example for specific kernel headers version) -ARG D_BASE_IMAGE="registry.access.redhat.com/ubi9/ubi:latest" -# Standart: registry.access.redhat.com/ubi9:latest - -ARG D_PYTHON_VERSION="36" -ARG D_PYTHON="python${D_PYTHON_VERSION}" - -# Final clean image of precompiled driver container -ARG D_FINAL_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:latest - -################################################################## -# Stage: build go binary for entrypoint -FROM golang:1.24 AS go_builder - -# Set GOPROXY if provided -ARG GOPROXY -ENV GOPROXY=$GOPROXY - -WORKDIR /workspace - -COPY entrypoint/go.mod go.mod -COPY entrypoint/go.sum go.sum - -RUN go mod download - -COPY entrypoint/ . - -RUN TARGETARCH=${D_ARCH} TARGETOS=linux make build - -################################################################## -# Stage: Minimal base image update and install common requirements - -FROM $D_BASE_IMAGE AS base - -# Inherited global args -ARG D_OS - -RUN set -x && \ -# Driver build / install script requirements - dnf -y install perl \ -# Container functional requirements - jq iproute kmod procps-ng udev - -COPY --from=go_builder /workspace/build/entrypoint /root/entrypoint -WORKDIR /root -ADD ./entrypoint.sh /root/entrypoint.sh -ADD ./loader.sh /root/loader.sh - -ENTRYPOINT ["/root/loader.sh"] - -############################################################################################## -# Stage: Download NVIDIA driver sources and install src driver container packages requirements - -FROM base AS driver-src - -# Inherited global args -ARG D_DOCA_VERSION -ARG D_OFED_VERSION -ARG D_CONTAINER_VER -ARG D_OFED_SRC_DOWNLOAD_PATH - -# Stage args -ARG D_OFED_BASE_URL="https://linux.mellanox.com/public/repo/doca/${D_DOCA_VERSION}/SOURCES/mlnx_ofed" -ARG D_OFED_SRC_TYPE="" - -ARG D_OFED_SRC_ARCHIVE="MLNX_OFED_SRC-${D_OFED_SRC_TYPE}${D_OFED_VERSION}.tgz" -ARG D_OFED_URL_PATH="${D_OFED_BASE_URL}/${D_OFED_SRC_ARCHIVE}" # although argument name says URL, local `*.tgz` compressed files may also be used (intended for internal use) - -ENV NVIDIA_NIC_DRIVER_VER=${D_OFED_VERSION} -ENV NVIDIA_NIC_CONTAINER_VER=${D_CONTAINER_VER} -ENV NVIDIA_NIC_DRIVER_PATH="${D_OFED_SRC_DOWNLOAD_PATH}/MLNX_OFED_SRC-${D_OFED_VERSION}" - -WORKDIR /root -RUN set -x && \ -# Install prerequirements - dnf install -y curl --allowerasing \ -# Driver build requirements - autoconf python3-devel ethtool automake pciutils libtool hostname dracut - -# Download NVIDIA NIC driver -RUN mkdir -p ${D_OFED_SRC_DOWNLOAD_PATH} -WORKDIR ${D_OFED_SRC_DOWNLOAD_PATH} -ADD ${D_OFED_URL_PATH} ${D_OFED_SRC_ARCHIVE} -RUN if file ${D_OFED_SRC_ARCHIVE} | grep compressed; then \ - tar -xzf ${D_OFED_SRC_ARCHIVE}; \ - else \ - mv ${D_OFED_SRC_ARCHIVE}/MLNX_OFED_SRC-${D_OFED_VERSION} . ; \ - fi - -WORKDIR /root -ADD ./entrypoint.sh /root/entrypoint.sh -ADD ./dtk_nic_driver_build.sh /root/dtk_nic_driver_build.sh - -ENTRYPOINT ["/root/loader.sh"] -CMD ["sources"] - -LABEL doca-version=${D_DOCA_VERSION} -LABEL ofed-version=${D_OFED_VERSION} - -##################### -# Stage: Build driver - -FROM driver-src AS driver-builder - -# Inherited global args -ARG D_OS -ARG D_KERNEL_VER -ARG OFED_SRC_LOCAL_DIR - -RUN set -x && \ -# MOFED installation requirements - dnf install -y autoconf gcc make rpm-build - -# Build driver -RUN set -x && \ - ${OFED_SRC_LOCAL_DIR}/install.pl --without-depcheck --distro ${D_OS} --kernel ${D_KERNEL_VER} --kernel-sources /lib/modules/${D_KERNEL_VER}/build --kernel-only --build-only --without-iser --without-srp --without-isert --without-knem --without-xpmem --with-mlnx-tools --with-ofed-scripts --copy-ifnames-udev - -################################### -# Stage: Install precompiled driver - -ARG D_FINAL_BASE_IMAGE - -FROM $D_FINAL_BASE_IMAGE AS precompiled - -# Inherited global args -ARG D_ARCH -ARG D_KERNEL_VER -ARG D_OFED_VERSION -ARG D_CONTAINER_VER -ARG OFED_SRC_LOCAL_DIR - -ENV NVIDIA_NIC_DRIVER_VER=${D_OFED_VERSION} -ENV NVIDIA_NIC_DRIVER_PATH="" -ENV NVIDIA_NIC_CONTAINER_VER=${D_CONTAINER_VER} - -COPY --from=driver-builder ${OFED_SRC_LOCAL_DIR}/RPMS/redhat-release-*/${D_ARCH}/*.rpm /root/ - -RUN rpm -ivh --nodeps /root/*.rpm - -RUN set -x && \ -# MOFED functional requirements - dnf install -y pciutils hostname udev ethtool \ -# Container functional requirements - jq iproute kmod procps-ng udev - -# Prevent modprobe from giving a WARNING about missing files -RUN touch /lib/modules/${D_KERNEL_VER}/modules.order /lib/modules/${D_KERNEL_VER}/modules.builtin && \ -# Introduce installed kernel modules - depmod ${D_KERNEL_VER} - -WORKDIR / -ADD ./entrypoint.sh /root/entrypoint.sh -ADD ./dtk_nic_driver_build.sh /root/dtk_nic_driver_build.sh - -ENTRYPOINT ["/root/loader.sh"] -CMD ["precompiled"] - diff --git a/RHEL_Dockerfile b/RHEL_Dockerfile index a213eed..a0e3605 100644 --- a/RHEL_Dockerfile +++ b/RHEL_Dockerfile @@ -12,9 +12,6 @@ ARG STIG_COMPLIANT=false # Final clean image of precompiled driver container ARG D_FINAL_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:latest -################################################################## -# Stage: Minimal base image update and install common requirements - # DTK base image (below example for specific kernel headers version) ARG D_BASE_IMAGE="registry.redhat.io/openshift4/driver-toolkit-rhel9:v4.13.0-202309112001.p0.gd719bdc.assembly.stream" # Standart: registry.access.redhat.com/ubi9:latest @@ -22,6 +19,28 @@ ARG D_BASE_IMAGE="registry.redhat.io/openshift4/driver-toolkit-rhel9:v4.13.0-202 ARG D_PYTHON_VERSION="36" ARG D_PYTHON="python${D_PYTHON_VERSION}" +################################################################## +# Stage: build go binary for entrypoint +FROM golang:1.24 AS go_builder + +# Set GOPROXY if provided +ARG GOPROXY +ENV GOPROXY=$GOPROXY + +WORKDIR /workspace + +COPY entrypoint/go.mod go.mod +COPY entrypoint/go.sum go.sum + +RUN go mod download + +COPY entrypoint/ . + +RUN TARGETARCH=${D_ARCH} TARGETOS=linux make build + +################################################################## +# Stage: Minimal base image update and install common requirements + FROM $D_BASE_IMAGE AS base # Inherited global args @@ -38,6 +57,13 @@ RUN set -x && \ # Container functional requirements jq iproute kmod procps-ng udev +COPY --from=go_builder /workspace/build/entrypoint /root/entrypoint +WORKDIR /root +ADD ./entrypoint.sh /root/entrypoint.sh +ADD ./loader.sh /root/loader.sh + +ENTRYPOINT ["/root/loader.sh"] + ############################################################################################## # Stage: Download NVIDIA driver sources and install src driver container packages requirements @@ -78,7 +104,7 @@ RUN if file ${D_OFED_SRC_ARCHIVE} | grep compressed; then \ mv ${D_OFED_SRC_ARCHIVE}/MLNX_OFED_SRC-${D_OFED_VERSION} . ; \ fi -WORKDIR / +WORKDIR /root ADD ./entrypoint.sh /root/entrypoint.sh ADD ./dtk_nic_driver_build.sh /root/dtk_nic_driver_build.sh @@ -92,7 +118,7 @@ RUN set -x && \ fi && \ rm -f /tmp/stig-fixer.sh -ENTRYPOINT ["/root/entrypoint.sh"] +ENTRYPOINT ["/root/loader.sh"] CMD ["sources"] LABEL doca-version=${D_DOCA_VERSION} @@ -149,9 +175,11 @@ RUN touch /lib/modules/${D_KERNEL_VER}/modules.order /lib/modules/${D_KERNEL_VER # Introduce installed kernel modules depmod ${D_KERNEL_VER} -WORKDIR / +WORKDIR /root ADD ./entrypoint.sh /root/entrypoint.sh +ADD ./loader.sh /root/loader.sh +COPY --from=go_builder /workspace/build/entrypoint /root/entrypoint ADD ./dtk_nic_driver_build.sh /root/dtk_nic_driver_build.sh -ENTRYPOINT ["/root/entrypoint.sh"] +ENTRYPOINT ["/root/loader.sh"] CMD ["precompiled"] diff --git a/THIRD_PARTY_NOTICES b/THIRD_PARTY_NOTICES index be7f070..67991ab 100644 --- a/THIRD_PARTY_NOTICES +++ b/THIRD_PARTY_NOTICES @@ -720,6 +720,29 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. See the License for the specific language governing permissions and limitations under the License. +--- +## go-shellquote + +Copyright (C) 2014 Kevin Ballard + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + --- ## entrypoint diff --git a/dtk_nic_driver_build.sh b/dtk_nic_driver_build.sh index af7ccfa..a8a9eab 100755 --- a/dtk_nic_driver_build.sh +++ b/dtk_nic_driver_build.sh @@ -1,6 +1,18 @@ #!/bin/bash # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Load environment variables if file exists +if [ -f "$(dirname "$0")/dtk.env" ]; then + source "$(dirname "$0")/dtk.env" +fi + +: ${USE_NEW_ENTRYPOINT:=false} + +if [ "$USE_NEW_ENTRYPOINT" = "true" ]; then + echo "Using Go entrypoint for DTK build" + exec "$(dirname "$0")/entrypoint" dtk-build +fi + : ${ENTRYPOINT_DEBUG:=false} : ${DTK_OCP_NIC_SHARED_DIR:=/mnt/shared-nvidia-nic-driver-toolkit} diff --git a/entrypoint/cmd/main.go b/entrypoint/cmd/main.go index 9c125c4..d691283 100644 --- a/entrypoint/cmd/main.go +++ b/entrypoint/cmd/main.go @@ -17,11 +17,13 @@ package main import ( + "context" "encoding/json" "flag" "fmt" "os" "os/signal" + "path/filepath" "syscall" "github.com/go-logr/logr" @@ -30,10 +32,39 @@ import ( "github.com/Mellanox/doca-driver-build/entrypoint/internal/config" "github.com/Mellanox/doca-driver-build/entrypoint/internal/constants" + "github.com/Mellanox/doca-driver-build/entrypoint/internal/dtk" "github.com/Mellanox/doca-driver-build/entrypoint/internal/entrypoint" + "github.com/Mellanox/doca-driver-build/entrypoint/internal/utils/cmd" "github.com/Mellanox/doca-driver-build/entrypoint/internal/version" ) +type ctxData struct { + //nolint:containedctx + Ctx context.Context + Cancel context.CancelFunc +} + +// setupSignalHandler takes a signal channel and contexts with cancel functions. +// It starts a goroutine that cancels the first uncanceled context on receiving a signal, +// if no uncanceled context exists, it exits the application with code 1. +func setupSignalHandler(ch chan os.Signal, ctxs []ctxData) { + go func() { + OUT: + for { + <-ch + for _, ctx := range ctxs { + if ctx.Ctx.Err() != nil { + // context is already canceled, try next one + continue + } + ctx.Cancel() + continue OUT + } + os.Exit(1) + } + }() +} + func main() { cfg, err := config.GetConfig() if err != nil { @@ -57,6 +88,20 @@ func main() { os.Exit(1) } log.Info("start manager", "mode", containerMode) + if containerMode == constants.DriverContainerModeDtkBuild { + // Use a context that is canceled on signal + ctx, cancel := context.WithCancel(context.Background()) + // Attach logger to context + ctx = logr.NewContext(ctx, log) + setupSignalHandler(getSignalChannel(), []ctxData{{Ctx: ctx, Cancel: cancel}}) + + if err := dtk.RunBuild(ctx, log, cfg, cmd.New()); err != nil { + log.Error(err, "DTK Build failed") + os.Exit(1) + } + return + } + if err := entrypoint.Run(getSignalChannel(), log, containerMode, cfg); err != nil { log.Error(err, "Entrypoint Run failed") os.Exit(1) @@ -67,9 +112,11 @@ func getContainerMode() (string, error) { flag.Parse() containerMode := flag.Arg(0) if flag.NArg() != 1 || - (containerMode != constants.DriverContainerModePrecompiled && containerMode != string(constants.DriverContainerModeSources)) { - return "", fmt.Errorf("container mode argument has invalid value %s, supported values: %s, %s", - containerMode, constants.DriverContainerModePrecompiled, constants.DriverContainerModeSources) + (containerMode != constants.DriverContainerModePrecompiled && + containerMode != constants.DriverContainerModeSources && + containerMode != constants.DriverContainerModeDtkBuild) { + return "", fmt.Errorf("container mode argument has invalid value %s, supported values: %s, %s, %s", + containerMode, constants.DriverContainerModePrecompiled, constants.DriverContainerModeSources, constants.DriverContainerModeDtkBuild) } return containerMode, nil } @@ -87,6 +134,11 @@ func getLogger(cfg config.Config) logr.Logger { if cfg.EntrypointDebug { logConfig.Level = zap.NewAtomicLevelAt(zap.DebugLevel) if cfg.DebugLogFile != "" { + // Create directory if it doesn't exist + logDir := filepath.Dir(cfg.DebugLogFile) + if err := os.MkdirAll(logDir, 0o755); err != nil { + fmt.Fprintf(os.Stderr, "WARNING: failed to create log directory %s: %v\n", logDir, err) + } logConfig.OutputPaths = append(logConfig.OutputPaths, cfg.DebugLogFile) logConfig.ErrorOutputPaths = append(logConfig.ErrorOutputPaths, cfg.DebugLogFile) } diff --git a/entrypoint/go.mod b/entrypoint/go.mod index 9e72f9b..62d8749 100644 --- a/entrypoint/go.mod +++ b/entrypoint/go.mod @@ -6,6 +6,7 @@ require ( github.com/go-logr/logr v1.4.3 github.com/gofrs/flock v0.13.0 github.com/k8snetworkplumbingwg/sriovnet v1.2.0 + github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 github.com/stretchr/testify v1.11.1 github.com/vishvananda/netlink v1.3.1 go.uber.org/zap v1.27.1 diff --git a/entrypoint/go.sum b/entrypoint/go.sum index 7d42f07..b40d032 100644 --- a/entrypoint/go.sum +++ b/entrypoint/go.sum @@ -147,6 +147,8 @@ github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1 github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/k8snetworkplumbingwg/sriovnet v1.2.0 h1:6ELfAxCB1dvosGUy3DVRmfH+HWTzmPD3W67HKQvMR1M= github.com/k8snetworkplumbingwg/sriovnet v1.2.0/go.mod h1:jyWzGe6ZtYiPq6ih6aXCOy6mZ49Y9mNyBOLBBXnli+k= +github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= +github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= diff --git a/entrypoint/internal/config/config.go b/entrypoint/internal/config/config.go index f7d4c4e..c9412ee 100644 --- a/entrypoint/internal/config/config.go +++ b/entrypoint/internal/config/config.go @@ -43,6 +43,10 @@ type Config struct { DtkOcpDriverBuild bool `env:"DTK_OCP_DRIVER_BUILD"` DtkOcpNicSharedDir string `env:"DTK_OCP_NIC_SHARED_DIR" envDefault:"/mnt/shared-nvidia-nic-driver-toolkit"` + DtkOcpCompiledDriverVer string `env:"DTK_OCP_COMPILED_DRIVER_VER"` + DtkOcpStartCompileFlag string `env:"DTK_OCP_START_COMPILE_FLAG"` + DtkOcpDoneCompileFlag string `env:"DTK_OCP_DONE_COMPILE_FLAG"` + AppendDriverBuildFlags string `env:"APPEND_DRIVER_BUILD_FLAGS"` NvidiaNicDriversInventoryPath string `env:"NVIDIA_NIC_DRIVERS_INVENTORY_PATH"` OfedBlacklistModulesFile string `env:"OFED_BLACKLIST_MODULES_FILE" envDefault:"/host/etc/modprobe.d/blacklist-ofed-modules.conf"` diff --git a/entrypoint/internal/constants/constants.go b/entrypoint/internal/constants/constants.go index 3f5a244..5a22f7a 100644 --- a/entrypoint/internal/constants/constants.go +++ b/entrypoint/internal/constants/constants.go @@ -21,6 +21,7 @@ const ( DriverContainerModeSources = "sources" DriverContainerModePrecompiled = "precompiled" + DriverContainerModeDtkBuild = "dtk-build" // OS Types OSTypeUbuntu = "ubuntu" @@ -33,4 +34,9 @@ const ( DefaultOpenShiftVersion = "4.9" InvalidGUID = "00:00:00:00:00:00:00:00" + + // DTK constants + DtkOcpBuildScriptPath = "/root/dtk_nic_driver_build.sh" + DtkStartCompileFlag = "dtk_start_compile" + DtkDoneCompileFlagPrefix = "dtk_done_compile_" ) diff --git a/entrypoint/internal/driver/driver.go b/entrypoint/internal/driver/driver.go index ba0684f..a271cfd 100644 --- a/entrypoint/internal/driver/driver.go +++ b/entrypoint/internal/driver/driver.go @@ -171,25 +171,38 @@ func (d *driverMgr) Build(ctx context.Context) error { // Mark build as incomplete at the start d.driverBuildIncomplete = true - // Create inventory directory - if err := d.createInventoryDirectory(ctx, inventoryPath); err != nil { - return fmt.Errorf("failed to create inventory directory: %w", err) - } + // Check if DTK OCP driver build is enabled + if d.cfg.DtkOcpDriverBuild { + if err := d.buildDriverDTK(ctx, kernelVersion, inventoryPath); err != nil { + return err + } + } else { + // Create inventory directory + if err := d.createInventoryDirectory(ctx, inventoryPath); err != nil { + return fmt.Errorf("failed to create inventory directory: %w", err) + } - // Install OS-specific prerequisites - log.V(1).Info("About to install prerequisites", "os", osType, "kernel", kernelVersion) - if err := d.installPrerequisitesForOS(ctx, osType, kernelVersion); err != nil { - return fmt.Errorf("failed to install prerequisites: %w", err) - } + // Install OS-specific prerequisites + log.V(1).Info("About to install prerequisites", "os", osType, "kernel", kernelVersion) + if err := d.installPrerequisitesForOS(ctx, osType, kernelVersion); err != nil { + return fmt.Errorf("failed to install prerequisites: %w", err) + } - // Build driver from source - if err := d.buildDriverFromSource(ctx, d.cfg.NvidiaNicDriverPath, kernelVersion, osType); err != nil { - return fmt.Errorf("failed to build driver from source: %w", err) - } + // Build driver from source + if err := d.buildDriverFromSource(ctx, d.cfg.NvidiaNicDriverPath, kernelVersion, osType); err != nil { + return fmt.Errorf("failed to build driver from source: %w", err) + } + + // Copy build artifacts to inventory + if err := d.copyBuildArtifacts(ctx, d.cfg.NvidiaNicDriverPath, inventoryPath, osType); err != nil { + return fmt.Errorf("failed to copy build artifacts: %w", err) + } - // Copy build artifacts to inventory - if err := d.copyBuildArtifacts(ctx, d.cfg.NvidiaNicDriverPath, inventoryPath, osType); err != nil { - return fmt.Errorf("failed to copy build artifacts: %w", err) + // Fix source link if needed + if err := d.fixSourceLink(ctx, kernelVersion); err != nil { + log.V(1).Info("Failed to fix source link", "error", err) + // Non-fatal error, continue + } } // Calculate and store checksum @@ -199,12 +212,6 @@ func (d *driverMgr) Build(ctx context.Context) error { } } - // Fix source link if needed - if err := d.fixSourceLink(ctx, kernelVersion); err != nil { - log.V(1).Info("Failed to fix source link", "error", err) - // Non-fatal error, continue - } - // Mark build as complete after successful build d.driverBuildIncomplete = false diff --git a/entrypoint/internal/driver/driver_dtk.go b/entrypoint/internal/driver/driver_dtk.go new file mode 100644 index 0000000..ff383c5 --- /dev/null +++ b/entrypoint/internal/driver/driver_dtk.go @@ -0,0 +1,254 @@ +/* + Copyright 2025, NVIDIA CORPORATION & AFFILIATES + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package driver + +import ( + "context" + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + "time" + + "github.com/go-logr/logr" + + "github.com/Mellanox/doca-driver-build/entrypoint/internal/constants" +) + +// buildDriverDTK orchestrates the driver build using the OpenShift Driver Toolkit (DTK) +func (d *driverMgr) buildDriverDTK(ctx context.Context, kernelVersion, inventoryPath string) error { + log := logr.FromContextOrDiscard(ctx) + log.Info("Starting DTK driver build") + + // Sanitize kernel version for DTK shared directory + // Matches bash: DTK_KVER=$(echo "${FULL_KVER}" | sed 's/[^-A-Za-z0-9_.]/_/g' | sed 's/^[-_.]*//;s/[-_.]*$//') + dtkKver := sanitizeKernelVersion(kernelVersion) + dtkSharedDir := filepath.Join(d.cfg.DtkOcpNicSharedDir, dtkKver) + + // Construct done flag path + // Matches bash: DTK_OCP_DONE_COMPILE_FLAG="${DTK_OCP_DONE_COMPILE_FLAG_PREFIX}$(echo ${NVIDIA_NIC_DRIVER_VER} | sed 's/[.-]/_/g')" + verSanitized := strings.ReplaceAll(strings.ReplaceAll(d.cfg.NvidiaNicDriverVer, ".", "_"), "-", "_") + doneFlagName := constants.DtkDoneCompileFlagPrefix + verSanitized + doneFlagPath := filepath.Join(dtkSharedDir, doneFlagName) + startFlagPath := filepath.Join(dtkSharedDir, constants.DtkStartCompileFlag) + + // Check if build is already done + if _, err := d.os.Stat(doneFlagPath); os.IsNotExist(err) { + log.Info("DTK build not done, setting up build") + + if err := d.dtkSetupDriverBuild(ctx, dtkSharedDir, startFlagPath, doneFlagPath); err != nil { + return fmt.Errorf("failed to setup DTK build: %w", err) + } + + if err := d.dtkWaitForBuild(ctx, doneFlagPath); err != nil { + return fmt.Errorf("failed waiting for DTK build: %w", err) + } + } else { + log.Info("DTK build already done", "flag", doneFlagPath) + } + + // Finalize build (copy artifacts) + if err := d.dtkFinalizeDriverBuild(ctx, dtkSharedDir, inventoryPath); err != nil { + return fmt.Errorf("failed to finalize DTK build: %w", err) + } + + return nil +} + +// sanitizeKernelVersion sanitizes the kernel version string for use in directory names +func sanitizeKernelVersion(version string) string { + // Replace all non-alphanumeric characters (except -._) with underscore + reg := regexp.MustCompile(`[^-A-Za-z0-9_.]`) + sanitized := reg.ReplaceAllString(version, "_") + // Trim leading/trailing -._ + return strings.Trim(sanitized, "-._") +} + +// dtkSetupDriverBuild prepares the shared directory and script for DTK build +func (d *driverMgr) dtkSetupDriverBuild(ctx context.Context, sharedDir, startFlagPath, doneFlagPath string) error { + log := logr.FromContextOrDiscard(ctx) + log.Info("Setting up DTK driver build", "sharedDir", sharedDir) + + // Create shared directory + if err := d.os.MkdirAll(sharedDir, 0o755); err != nil { + return fmt.Errorf("failed to create shared directory: %w", err) + } + + // Copy driver sources to shared directory + // Matches bash: cp -r ${NVIDIA_NIC_DRIVER_PATH} ${DTK_OCP_NIC_SHARED_DIR}/ + srcDir := d.cfg.NvidiaNicDriverPath + // Use expected directory name format to ensure DTK build script finds it + expectedName := fmt.Sprintf("MLNX_OFED_SRC-%s", d.cfg.NvidiaNicDriverVer) + destDir := filepath.Join(sharedDir, expectedName) + + // Clean up destination if it exists to avoid nesting (cp -r behavior) + if err := d.os.RemoveAll(destDir); err != nil { + return fmt.Errorf("failed to clean up destination directory: %w", err) + } + + log.Info("Copying driver sources", "from", srcDir, "to", destDir) + if err := d.copyDir(ctx, srcDir, destDir); err != nil { + return fmt.Errorf("failed to copy driver sources: %w", err) + } + + // Copy entrypoint binary to shared directory + entrypointPath := "/root/entrypoint" // Assumed location based on Dockerfile + destEntrypointPath := filepath.Join(sharedDir, "entrypoint") + log.Info("Copying entrypoint binary", "from", entrypointPath, "to", destEntrypointPath) + // We use copyFile instead of copyDir for a single file + if _, _, err := d.cmd.RunCommand(ctx, "cp", entrypointPath, destEntrypointPath); err != nil { + return fmt.Errorf("failed to copy entrypoint binary: %w", err) + } + + // Create dtk.env file + // Get append flags + appendFlags := d.getAppendDriverBuildFlags(constants.OSTypeRedHat) + appendFlagsStr := strings.Join(appendFlags, " ") + + envContent := fmt.Sprintf(`export DTK_OCP_NIC_SHARED_DIR="%s" +export DTK_OCP_COMPILED_DRIVER_VER="%s" +export DTK_OCP_START_COMPILE_FLAG="%s" +export DTK_OCP_DONE_COMPILE_FLAG="%s" +export APPEND_DRIVER_BUILD_FLAGS="%s" +export USE_NEW_ENTRYPOINT="true" +export NVIDIA_NIC_DRIVER_VER="%s" +`, sharedDir, d.cfg.NvidiaNicDriverVer, startFlagPath, doneFlagPath, appendFlagsStr, d.cfg.NvidiaNicDriverVer) + + envPath := filepath.Join(sharedDir, "dtk.env") + if err := d.os.WriteFile(envPath, []byte(envContent), 0o644); err != nil { + return fmt.Errorf("failed to write dtk.env: %w", err) + } + + // Copy build script (loader) + srcScriptPath := constants.DtkOcpBuildScriptPath + destScriptPath := filepath.Join(sharedDir, filepath.Base(srcScriptPath)) + log.Info("Copying build script", "from", srcScriptPath, "to", destScriptPath) + // We use copyFile equivalent (run command cp) + if _, _, err := d.cmd.RunCommand(ctx, "cp", srcScriptPath, destScriptPath); err != nil { + return fmt.Errorf("failed to copy build script: %w", err) + } + + // Create start flag + log.Info("Creating start compile flag", "path", startFlagPath) + if _, err := d.os.Create(startFlagPath); err != nil { + return fmt.Errorf("failed to create start flag: %w", err) + } + + return nil +} + +// dtkWaitForBuild waits for the DTK build to complete +func (d *driverMgr) dtkWaitForBuild(ctx context.Context, doneFlagPath string) error { + log := logr.FromContextOrDiscard(ctx) + log.Info("Waiting for DTK build to complete", "doneFlag", doneFlagPath) + + sleepSec := 300 + totalRetries := 10 + totalSleepSec := 0 + + for totalRetries > 0 { + if _, err := d.os.Stat(doneFlagPath); err == nil { + log.Info("DTK build completed") + return nil + } + + log.Info("Awaiting DTK compilation", "next_query_sec", sleepSec) + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(time.Duration(sleepSec) * time.Second): + } + + totalSleepSec += sleepSec + if sleepSec > 10 { + sleepSec /= 2 + } + totalRetries-- + } + + return fmt.Errorf("timeout (%d sec) awaiting DTK compilation, %s not found", totalSleepSec, doneFlagPath) +} + +// dtkFinalizeDriverBuild copies the built artifacts back to the inventory +func (d *driverMgr) dtkFinalizeDriverBuild(ctx context.Context, sharedDir, inventoryPath string) error { + log := logr.FromContextOrDiscard(ctx) + log.Info("Finalizing DTK driver build", "inventoryPath", inventoryPath) + + if err := d.createInventoryDirectory(ctx, inventoryPath); err != nil { + return err + } + + // Construct path to RPMs in shared dir + // Matches bash: rpms_path="${DTK_OCP_NIC_SHARED_DIR}/MLNX_OFED_SRC-${NVIDIA_NIC_DRIVER_VER}/RPMS/redhat-release-*/${ARCH}/" + arch := d.getArchitecture(ctx) + srcDirName := fmt.Sprintf("MLNX_OFED_SRC-%s", d.cfg.NvidiaNicDriverVer) + // We need to handle the wildcard "redhat-release-*" + rpmsBase := filepath.Join(sharedDir, srcDirName, "RPMS") + + // Find the redhat-release directory + entries, err := d.os.ReadDir(rpmsBase) + if err != nil { + return fmt.Errorf("failed to read RPMS directory %s: %w", rpmsBase, err) + } + + var redhatDir string + for _, entry := range entries { + if entry.IsDir() && strings.HasPrefix(entry.Name(), "redhat-release-") { + redhatDir = entry.Name() + break + } + } + + if redhatDir == "" { + return fmt.Errorf("redhat-release directory not found in %s", rpmsBase) + } + + rpmsPath := filepath.Join(rpmsBase, redhatDir, arch) + + // Copy RPMs + // Matches bash: cp -rf ${rpms_path}/*.rpm ${driver_inventory_path}/ + log.Info("Copying RPMs", "from", rpmsPath, "to", inventoryPath) + + // Copy RPMs using glob to avoid shell injection + rpmsGlob := filepath.Join(rpmsPath, "*.rpm") + files, err := filepath.Glob(rpmsGlob) + if err != nil { + return fmt.Errorf("failed to glob RPM files: %w", err) + } + if len(files) == 0 { + return fmt.Errorf("no RPM files found in %s", rpmsPath) + } + + for _, file := range files { + dest := filepath.Join(inventoryPath, filepath.Base(file)) + if _, _, err := d.cmd.RunCommand(ctx, "cp", "-f", file, dest); err != nil { + return fmt.Errorf("failed to copy %s: %w", file, err) + } + } + + return nil +} + +// copyDir copies a directory recursively +func (d *driverMgr) copyDir(ctx context.Context, src, dest string) error { + // Using cp -rT to treat dest as a normal file (directory) + // This ensures contents of src are copied into dest, not src into dest/src + _, _, err := d.cmd.RunCommand(ctx, "cp", "-rT", src, dest) + return err +} diff --git a/entrypoint/internal/driver/driver_test.go b/entrypoint/internal/driver/driver_test.go index ad4770f..1158f78 100644 --- a/entrypoint/internal/driver/driver_test.go +++ b/entrypoint/internal/driver/driver_test.go @@ -1305,13 +1305,20 @@ var _ = Describe("Driver", func() { // Mock copyBuildArtifacts - debug logging and copy cmdMock.EXPECT().RunCommand(ctx, "uname", "-m").Return("x86_64", "", nil) - cmdMock.EXPECT().RunCommand(ctx, "sh", "-c", mock.Anything).Return("", "", nil) // ls -la source directory - cmdMock.EXPECT().RunCommand(ctx, "sh", "-c", mock.Anything).Return("", "", nil) // find .deb files - cmdMock.EXPECT().RunCommand(ctx, "sh", "-c", mock.Anything).Return("", "", nil) // ls -la destination directory - cmdMock.EXPECT().RunCommand(ctx, "sh", "-c", mock.Anything).Return("", "", nil) // cp command + cmdMock.EXPECT().RunCommand(ctx, "sh", "-c", mock.Anything).Return("", "", nil).Times(4) + + // Mock fixSourceLink + cmdMock.EXPECT().RunCommand(ctx, "uname", "-m").Return("x86_64", "", nil) + osMock.EXPECT().Readlink(mock.Anything).Return("/usr/src/ofa_kernel/x86_64/5.4.0-42-generic", nil) // Mock storeBuildChecksum - return valid checksum - cmdMock.EXPECT().RunCommand(ctx, "sh", "-c", mock.Anything).Return("abc123def456", "", nil) + // Use a more specific matcher for the command to avoid matching other sh -c calls + cmdMock.EXPECT().RunCommand(ctx, "sh", "-c", mock.MatchedBy(func(cmd string) bool { + return strings.Contains(cmd, "md5sum") + })).Return("abc123def456", "", nil) + + // Mock WriteFile failure + osMock.EXPECT().WriteFile(mock.Anything, mock.Anything, os.FileMode(0o644)).Return(errors.New("write failed")) err := dm.Build(ctx) Expect(err).To(HaveOccurred()) diff --git a/entrypoint/internal/dtk/build.go b/entrypoint/internal/dtk/build.go new file mode 100644 index 0000000..c116e07 --- /dev/null +++ b/entrypoint/internal/dtk/build.go @@ -0,0 +1,138 @@ +/* + Copyright 2025, NVIDIA CORPORATION & AFFILIATES + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package dtk + +import ( + "context" + "fmt" + "os" + "path/filepath" + "time" + + "github.com/go-logr/logr" + "github.com/kballard/go-shellquote" + + "github.com/Mellanox/doca-driver-build/entrypoint/internal/config" + "github.com/Mellanox/doca-driver-build/entrypoint/internal/utils/cmd" +) + +// RunBuild executes the DTK driver build logic +func RunBuild(ctx context.Context, log logr.Logger, cfg config.Config, cmdHelper cmd.Interface) error { + log.Info("DTK driver build script start") + + if cfg.DtkOcpStartCompileFlag == "" || cfg.DtkOcpDoneCompileFlag == "" || + cfg.DtkOcpNicSharedDir == "" || cfg.DtkOcpCompiledDriverVer == "" { + err := fmt.Errorf("required DTK environment variables not set: %s, %s, %s, %s", + cfg.DtkOcpStartCompileFlag, cfg.DtkOcpDoneCompileFlag, cfg.DtkOcpNicSharedDir, cfg.DtkOcpCompiledDriverVer) + log.Error(err, "aborting") + return err + } + + // Install dependencies + // Req. for /install.pl script + log.Info("Installing perl") + if _, _, err := cmdHelper.RunCommand(ctx, "dnf", "install", "-y", "perl"); err != nil { + return fmt.Errorf("failed to install perl: %w", err) + } + + // Req. for build + log.Info("Installing build dependencies") + deps := []string{"ethtool", "autoconf", "pciutils", "automake", "libtool", "python3-devel"} + args := append([]string{"install", "-y"}, deps...) + if _, _, err := cmdHelper.RunCommand(ctx, "dnf", args...); err != nil { + return fmt.Errorf("failed to install build dependencies: %w", err) + } + + // Wait for start flag + retryDelay := 3 * time.Second + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + if _, err := os.Stat(cfg.DtkOcpStartCompileFlag); err == nil { + break + } + log.Info("Awaiting driver container preparations prior compilation", "next_query_sec", retryDelay.Seconds()) + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(retryDelay): + } + } + + log.Info("Starting compilation of driver", "version", cfg.DtkOcpCompiledDriverVer) + + // Construct install.pl command + // ${DTK_OCP_NIC_SHARED_DIR}/MLNX_OFED_SRC-${DTK_OCP_COMPILED_DRIVER_VER}/install.pl + srcDirName := fmt.Sprintf("MLNX_OFED_SRC-%s", cfg.DtkOcpCompiledDriverVer) + installScript := filepath.Join(cfg.DtkOcpNicSharedDir, srcDirName, "install.pl") + + installArgs := []string{ + installScript, + "--build-only", + "--kernel-only", + "--without-knem", + "--without-iser", + "--without-isert", + "--without-srp", + "--with-mlnx-tools", + "--with-ofed-scripts", + "--copy-ifnames-udev", + } + + if cfg.AppendDriverBuildFlags != "" { + // Use shell-style parsing to handle quoted arguments correctly + flags, err := shellquote.Split(cfg.AppendDriverBuildFlags) + if err != nil { + return fmt.Errorf("failed to parse APPEND_DRIVER_BUILD_FLAGS: %w", err) + } + installArgs = append(installArgs, flags...) + } + + // Execute build + log.Info("Executing build command", "command", installArgs[0], "args", installArgs[1:]) + if _, _, err := cmdHelper.RunCommand(ctx, installArgs[0], installArgs[1:]...); err != nil { + // Check if error is context canceled + if ctx.Err() != nil { + log.Info("Build canceled by context") + return ctx.Err() + } + return fmt.Errorf("driver build failed: %w", err) + } + + // Create done flag + if _, err := os.Create(cfg.DtkOcpDoneCompileFlag); err != nil { + return fmt.Errorf("failed to create done flag: %w", err) + } + + // Remove start flag + if err := os.Remove(cfg.DtkOcpStartCompileFlag); err != nil { + log.Error(err, "failed to remove start flag") + // Non-fatal + } + + log.Info("DTK driver build script end") + + // Sleep infinity with context support + log.Info("Build completed, sleeping indefinitely") + <-ctx.Done() + return ctx.Err() +} diff --git a/entrypoint/internal/dtk/build_test.go b/entrypoint/internal/dtk/build_test.go new file mode 100644 index 0000000..6514a9a --- /dev/null +++ b/entrypoint/internal/dtk/build_test.go @@ -0,0 +1,120 @@ +/* + Copyright 2025, NVIDIA CORPORATION & AFFILIATES + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package dtk + +import ( + "context" + "errors" + "os" + "path/filepath" + "testing" + "time" + + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + + "github.com/Mellanox/doca-driver-build/entrypoint/internal/config" + cmdMockPkg "github.com/Mellanox/doca-driver-build/entrypoint/internal/utils/cmd/mocks" +) + +func TestRunBuild(t *testing.T) { + log := logr.Discard() + tempDir := t.TempDir() + + startFlag := filepath.Join(tempDir, "dtk_start_compile") + doneFlag := filepath.Join(tempDir, "dtk_done_compile") + + cfg := config.Config{ + DtkOcpStartCompileFlag: startFlag, + DtkOcpDoneCompileFlag: doneFlag, + DtkOcpCompiledDriverVer: "1.0.0", + DtkOcpNicSharedDir: tempDir, + } + + t.Run("should fail if flags are not set", func(t *testing.T) { + err := RunBuild(context.Background(), log, config.Config{}, nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "required DTK environment variables not set") + }) + + t.Run("should fail if perl installation fails", func(t *testing.T) { + cmdMock := cmdMockPkg.NewInterface(t) + cmdMock.EXPECT().RunCommand(mock.Anything, "dnf", "install", "-y", "perl").Return("", "", errors.New("dnf failed")) + + err := RunBuild(context.Background(), log, cfg, cmdMock) + assert.Error(t, err) + assert.Contains(t, err.Error(), "failed to install perl") + }) + + t.Run("should fail if build dependencies installation fails", func(t *testing.T) { + cmdMock := cmdMockPkg.NewInterface(t) + cmdMock.EXPECT().RunCommand(mock.Anything, "dnf", "install", "-y", "perl").Return("", "", nil) + cmdMock.EXPECT().RunCommand(mock.Anything, "dnf", "install", "-y", "ethtool", "autoconf", "pciutils", "automake", "libtool", "python3-devel").Return("", "", errors.New("dnf failed")) + + err := RunBuild(context.Background(), log, cfg, cmdMock) + assert.Error(t, err) + assert.Contains(t, err.Error(), "failed to install build dependencies") + }) + + t.Run("should wait for start flag and run build", func(t *testing.T) { + cmdMock := cmdMockPkg.NewInterface(t) + cmdMock.EXPECT().RunCommand(mock.Anything, "dnf", "install", "-y", "perl").Return("", "", nil) + cmdMock.EXPECT().RunCommand(mock.Anything, "dnf", "install", "-y", "ethtool", "autoconf", "pciutils", "automake", "libtool", "python3-devel").Return("", "", nil) + + // Create start flag after a short delay + go func() { + time.Sleep(100 * time.Millisecond) + f, err := os.Create(startFlag) + assert.NoError(t, err) + f.Close() + }() + + expectedInstallScript := filepath.Join(tempDir, "MLNX_OFED_SRC-1.0.0", "install.pl") + cmdMock.EXPECT().RunCommand(mock.Anything, expectedInstallScript, + "--build-only", "--kernel-only", "--without-knem", "--without-iser", "--without-isert", + "--without-srp", "--with-mlnx-tools", "--with-ofed-scripts", "--copy-ifnames-udev").Return("", "", nil) + + // Create a context that we can cancel to simulate end of execution + ctx, cancel := context.WithCancel(context.Background()) + + // Run in a goroutine so we can cancel it + errCh := make(chan error) + go func() { + errCh <- RunBuild(ctx, log, cfg, cmdMock) + }() + + // Wait for done flag to be created + // Increased timeout to account for retryDelay in RunBuild + assert.Eventually(t, func() bool { + _, err := os.Stat(doneFlag) + return err == nil + }, 5*time.Second, 100*time.Millisecond) + + // Wait for start flag to be removed + assert.Eventually(t, func() bool { + _, err := os.Stat(startFlag) + return os.IsNotExist(err) + }, 5*time.Second, 100*time.Millisecond) + + // Cancel context to stop the infinite loop + cancel() + + err := <-errCh + assert.ErrorIs(t, err, context.Canceled) + }) +} diff --git a/entrypoint/internal/utils/cmd/cmd.go b/entrypoint/internal/utils/cmd/cmd.go index 8d12909..e2e8292 100644 --- a/entrypoint/internal/utils/cmd/cmd.go +++ b/entrypoint/internal/utils/cmd/cmd.go @@ -58,6 +58,13 @@ func (c *cmd) RunCommand(ctx context.Context, command string, args ...string) (s var stdout, stderr bytes.Buffer cmd := exec.CommandContext(ctx, command, args...) + // Ensure child process is killed when context is canceled + cmd.Cancel = func() error { + if cmd.Process == nil { + return nil + } + return cmd.Process.Signal(syscall.SIGTERM) + } cmd.Stdout = &stdout cmd.Stderr = &stderr