diff --git a/include/cloysterhpc/services/options.h b/include/cloysterhpc/services/options.h index adc02e0e..1f481069 100644 --- a/include/cloysterhpc/services/options.h +++ b/include/cloysterhpc/services/options.h @@ -21,7 +21,7 @@ struct Options final { bool runAsDaemon; bool airGap; bool unattended; - bool disableMirrors; + bool enableMirrors; std::size_t logLevelInput; std::string error; std::string config; diff --git a/include/cloysterhpc/services/runner.h b/include/cloysterhpc/services/runner.h index 5db3d060..0ee8ee18 100644 --- a/include/cloysterhpc/services/runner.h +++ b/include/cloysterhpc/services/runner.h @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -17,35 +18,96 @@ #include #include -namespace cloyster::services::runner { +namespace cloyster::services::runner::shell { + +namespace unsafe { + template + [[nodiscard]] + int fmt(std::vector& output, + fmt::format_string format, Args&&... args) + { + auto command = fmt::format(format, std::forward(args)...); + + auto opts = cloyster::Singleton::get(); + if (!opts->dryRun) { + LOG_DEBUG("Running shell command: {}", command); + boost::process::ipstream pipe_stream; + boost::process::child child("/bin/bash", "-c", command, + boost::process::std_out > pipe_stream); + + std::string line; + while (pipe_stream && std::getline(pipe_stream, line)) { + output.emplace_back(line); + LOG_TRACE("{}", line); + } + + child.wait(); + LOG_DEBUG("Exit code: {}", child.exit_code()); + return child.exit_code(); + } else { + LOG_INFO("Dry Run: {}", command); + return 0; + } + } -template -int shellfmt(fmt::format_string fmt, Args&&... 
args) -{ - auto command = fmt::format(fmt, std::forward(args)...); - - auto opts = cloyster::Singleton::get(); - if (!opts->dryRun) { - LOG_DEBUG("Running shell command: {}", command); - boost::process::ipstream pipe_stream; - boost::process::child child( - "/bin/bash", "-c", command, boost::process::std_out > pipe_stream); - - std::string line; - while (pipe_stream && std::getline(pipe_stream, line)) { - LOG_TRACE("{}", line); + template + [[nodiscard]] + int fmt(fmt::format_string format, Args&&... args) + { + auto command = fmt::format(format, std::forward(args)...); + + auto opts = cloyster::Singleton::get(); + if (!opts->dryRun) { + LOG_DEBUG("Running shell command: {}", command); + boost::process::ipstream pipe_stream; + boost::process::child child("/bin/bash", "-c", command, + boost::process::std_out > pipe_stream); + + std::string line; + while (pipe_stream && std::getline(pipe_stream, line)) { + LOG_TRACE("{}", line); + } + + child.wait(); + LOG_DEBUG("Exit code: {}", child.exit_code()); + return child.exit_code(); + } else { + LOG_INFO("Dry Run: {}", command); + return 0; } + } +} - child.wait(); - LOG_DEBUG("Exit code: {}", child.exit_code()); - return child.exit_code(); - } else { - LOG_INFO("Dry Run: {}", command); - return 0; +template +void fmt(fmt::format_string format, Args&&... args) +{ + const std::string command + = fmt::format(format, std::forward(args)...); + const auto exitCode = unsafe::fmt("{}", command); + if (exitCode != 0) { + throw std::runtime_error(fmt::format( + "Command {} failed with exit code {}", command, exitCode)); } } -int shell(std::string_view cmd); +void cmd(std::string_view cmd); + +template +[[nodiscard]] +std::string output(fmt::format_string format, Args&&... 
args) +{ + std::vector output; + // Construct a command here because it will be used twice and + // we can't use args twice without hurting the perfect forwarding + const std::string command + = fmt::format(format, std::forward(args)...); + const auto exitCode = unsafe::fmt(output, "{}", command); + if (exitCode != 0) { + throw std::runtime_error(fmt::format( + "Command {} failed with exit code {}", command, exitCode)); + } + return fmt::format("{}", fmt::join(output, "\n")); +} } diff --git a/repos/repos.conf b/repos/repos.conf index ed303730..889c3b28 100644 --- a/repos/repos.conf +++ b/repos/repos.conf @@ -122,8 +122,9 @@ filename=xcat-deps.repo upstream.repo=http://xcat.org/files/xcat/repos/yum/devel/xcat-dep/rh{releasever}/{arch}/ upstream.gpgkey=http://xcat.org/files/xcat/repos/yum/devel/xcat-dep/rh{releasever}/{arch}/repodata/repomd.xml.key +# ofedVersion is configured in the answerfile at ofed.version [doca] name=NVIDIA DOCA Repository - RHEL rhel{osversion} filename=mlx-doca.repo -upstream.repo=https://linux.mellanox.com/public/repo/doca/latest/rhel{osversion}/{arch}/ +upstream.repo=https://linux.mellanox.com/public/repo/doca/{ofedVersion}/rhel{osversion}/{arch}/ upstream.gpgkey=https://linux.mellanox.com/public/repo/doca/latest/rhel{osversion}/{arch}/GPG-KEY-Mellanox.pub diff --git a/rpmspecs/opencattus.spec b/rpmspecs/opencattus.spec index 03657ab4..4354467b 100644 --- a/rpmspecs/opencattus.spec +++ b/rpmspecs/opencattus.spec @@ -1,6 +1,6 @@ Name: opencattus-installer Version: 1.0 -Release: 2 +Release: 4 Summary: OpenCATTUS Installer License: Apache 2.0 URL: https://versatushpc.com.br/opencattus/ @@ -49,6 +49,10 @@ install -m 644 repos/rocky-vault.conf %{buildroot}/opt/cloysterhpc/conf/repos/ro /opt/cloysterhpc/conf/repos/rocky-vault.conf %changelog +* Thu Aug 14 2025 Daniel Hilst - 1.0-4 - Bugfix +- Update OFED +- Dump configuration +- Add support for Rocky Linux 9.6 * Wed Jul 16 2025 Daniel Hilst - 1.0-3 - Add ansible roles - Add ansible roles 
implementation - Fix dnssec configuration generation in xCAT plugin diff --git a/src/diskImage.cpp b/src/diskImage.cpp index c2d2b969..09fe6b2e 100644 --- a/src/diskImage.cpp +++ b/src/diskImage.cpp @@ -38,30 +38,43 @@ void DiskImage::setPath(const std::filesystem::path& path) bool DiskImage::isKnownImage(const std::filesystem::path& path) { + constexpr auto chooseDistro = [](std::string_view imageView) + -> std::optional { + if (imageView.starts_with("Rocky")) { + return cloyster::models::OS::Distro::Rocky; + } else if (imageView.starts_with("rhel")) { + return cloyster::models::OS::Distro::RHEL; + } else if (imageView.starts_with("OracleLinux")) { + return cloyster::models::OS::Distro::OL; + } else if (imageView.starts_with("AlmaLinux")) { + return cloyster::models::OS::Distro::AlmaLinux; + } else { + return std::nullopt; + } + }; + for (const auto& image : m_knownImageFilename) { if (path.filename().string() == image) { LOG_TRACE("Disk image is recognized") auto imageView = std::string_view(image); - if (imageView.starts_with("Rocky")) { - m_distro = cloyster::models::OS::Distro::Rocky; - } else if (imageView.starts_with("rhel")) { - m_distro = cloyster::models::OS::Distro::RHEL; - } else if (imageView.starts_with("OracleLinux")) { - m_distro = cloyster::models::OS::Distro::OL; - } else if (imageView.starts_with("AlmaLinux")) { - m_distro = cloyster::models::OS::Distro::AlmaLinux; - } else { - throw std::logic_error(fmt::format( - "Can't determine the distro for the image {}", image)); + const auto distro = chooseDistro(imageView); + if (distro) { + m_distro = distro; + return true; } - - return true; } } - LOG_TRACE("Disk image is unknown. Maybe you're using a custom image or " - "changed the default name?"); + const auto distro + = chooseDistro(std::string_view(path.filename().string())); + if (distro) { + m_distro = distro; + return true; + } + cloyster::functions::abort( + "Disk image is unknown. 
Maybe you're using a custom image or " + "changed the default name?"); return false; } diff --git a/src/models/answerfile.cpp b/src/models/answerfile.cpp index 13fe8a0d..fd4aea1a 100644 --- a/src/models/answerfile.cpp +++ b/src/models/answerfile.cpp @@ -7,10 +7,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -426,7 +428,24 @@ void AnswerFile::loadSystemSettings() } system.version = m_keyfile.getString("system", "version"); - system.kernel = m_keyfile.getString("system", "kernel"); + const auto kernel = m_keyfile.getStringOpt("system", "kernel"); + if (kernel) { + system.kernel = kernel.value(); + LOG_INFO("Kernel override in the answerfile {}", system.kernel); + } else { + const auto latestKernelVersion = services::runner::shell::output( + // This runs very early so it stops loading all repositories caches, + // which is unnecessary, so I pinned --repo=appstream here + "dnf list --repo=appstream kernel-devel --available " + "--showduplicates | sed 1d | " + // captures the kernel arch, e.g. x86_64, in $1 + // this is important later when used to access /lib/modules/... 
+ // folders + R"(perl -lane '$F[0] =~ s/kernel-devel\.(.*)$//; printf "%s.%s\n", $F[1], $1' | tail -1)"); + system.kernel = latestKernelVersion; + LOG_INFO("Kernel omitted in the answerfile, using running kernel {}", + system.kernel); + } } AFNode AnswerFile::loadNode(const std::string& section) diff --git a/src/ofed.cpp b/src/ofed.cpp index 644e4aa5..37369218 100644 --- a/src/ofed.cpp +++ b/src/ofed.cpp @@ -3,6 +3,8 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include + #include #include #include @@ -47,6 +49,8 @@ bool OFED::installed() const void OFED::install() const { const auto opts = cloyster::Singleton::get(); + const auto cluster = cloyster::Singleton::get(); + const auto osinfo = cluster->getNodes()[0].getOS(); if (opts->dryRun) { LOG_WARN("Dry-Run: Skiping OFED installation"); @@ -73,14 +77,16 @@ void OFED::install() const cloyster::services::repos::RepoManager>::get(); auto osService = cloyster::Singleton::get(); + const std::string kernelVersion = std::string(osinfo.getKernel()); repoManager->enable("doca"); // Install the required packages runner->checkCommand("dnf makecache --repo=doca"); runner->checkCommand( - "dnf -y install kernel kernel-devel doca-extra"); + fmt::format("dnf -y install kernel-{kernelVersion} " + "kernel-devel-{kernelVersion} doca-extra", + fmt::arg("kernelVersion", kernelVersion))); - if (osService->getKernelRunning() - != osService->getKernelInstalled()) { + if (osService->getKernelRunning() != kernelVersion) { LOG_WARN("New kernel installed! Rebooting after the " "installation finishes is advised!"); } @@ -96,10 +102,9 @@ void OFED::install() const // The driver may support weak updates modules and load without // need for reboot. 
if (!opts->shouldSkip("compile-doca-driver")) { - runner->checkCommand( - "bash -c \"/opt/mellanox/doca/tools/doca-kernel-support -k " - "$(rpm -q --qf \"%{VERSION}-%{RELEASE}.%{ARCH}\n\" " - "kernel-devel)\""); + runner->checkCommand(fmt::format( + "/opt/mellanox/doca/tools/doca-kernel-support -k {}", + kernelVersion)); } // Get the last rpm in /tmp/DOCA*/ folder @@ -123,4 +128,4 @@ void OFED::install() const break; } -} \ No newline at end of file +} diff --git a/src/services/ansible/roles/base.cpp b/src/services/ansible/roles/base.cpp index 98c955a6..44d59b45 100644 --- a/src/services/ansible/roles/base.cpp +++ b/src/services/ansible/roles/base.cpp @@ -58,8 +58,18 @@ ScriptBuilder installScript( builder.addNewLine().addCommand("# Install general base packages"); - std::set allPackages = { "wget", "curl", "dnf-plugins-core", - "chkconfig", "jq", "tar", "python3-dnf-plugin-versionlock" }; + // "python3-dnf-plugin-versionlock" is conflicting with dnf-plugins-core + // during the first install + std::set allPackages = { + "wget", + "curl", + "dnf-plugins-core", + "chkconfig", + "initscripts", // @FIXME: This is only required if the provisioner is + // xCAT + "jq", + "tar", + }; if (const auto iter = role.m_vars.find("base_packages"); iter != role.m_vars.end()) { for (const auto& pkg : diff --git a/src/services/ansible/roles/spack.cpp b/src/services/ansible/roles/spack.cpp index c17e6e80..44244354 100644 --- a/src/services/ansible/roles/spack.cpp +++ b/src/services/ansible/roles/spack.cpp @@ -25,6 +25,8 @@ ScriptBuilder installScript( fmt::format("Expected spack role, found {}", role.m_roleName)); builder.addNewLine() + .addCommand("# Exit early if spack is already installed") + .addCommand("test -d /opt/spack/.git && exit 0") .addCommand("# Install dependencies for Spack") .addPackage("git") .addNewLine() diff --git a/src/services/ansible/roles/todo.txt b/src/services/ansible/roles/todo.txt new file mode 100644 index 00000000..153d2ce4 --- /dev/null +++ 
b/src/services/ansible/roles/todo.txt @@ -0,0 +1,10 @@ +Base (open) + 1. confluent + 2. RHEL10 sup + +Features (closed) + 1. ood + 2. grafana + 3. slurm + 4. integração ood slurm + 5. lustre diff --git a/src/services/options.cpp b/src/services/options.cpp index a4ef974d..4b21c6fa 100644 --- a/src/services/options.cpp +++ b/src/services/options.cpp @@ -24,7 +24,7 @@ std::unique_ptr options::factory(int argc, const char** argv) .runAsDaemon = false, .airGap = false, .unattended = false, - .disableMirrors = false, + .enableMirrors = false, .logLevelInput = 3, .error = "NO ERROR", .config = "", @@ -47,8 +47,7 @@ std::unique_ptr options::factory(int argc, const char** argv) app.add_flag("-t,--tui", opt.enableTUI, "Enable TUI"); app.add_flag("-c,--cli", opt.enableCLI, "Enable CLI"); app.add_flag("-D,--daemon", opt.runAsDaemon, "Run as daemon"); - app.add_flag( - "--disable-mirrors", opt.disableMirrors, "Disable mirror URLs"); + app.add_flag("--enable-mirrors", opt.enableMirrors, "Disable mirror URLs"); app.add_option("--mirror-url", opt.mirrorBaseUrl, "Base URL for mirror") ->default_str("https://mirror.versatushpc.com.br"); app.add_option( diff --git a/src/services/repos.cpp b/src/services/repos.cpp index 7e3b1437..bf00609c 100644 --- a/src/services/repos.cpp +++ b/src/services/repos.cpp @@ -344,6 +344,7 @@ struct RepoConfigVars final { std::string releasever; // major, ex: 9 std::string xcatVersion; // major.minor, ex: 2.17 or latest std::string zabbixVersion; // major.minor, ex: 6.4 + std::string ofedVersion; // major.minor, ex: 6.4 }; // Represents a Mirror Repository @@ -488,7 +489,7 @@ struct RepoChooser final { } const auto opts = cloyster::Singleton::get(); - if (opts->disableMirrors) { + if (!opts->enableMirrors) { return Choice::UPSTREAM; } @@ -522,7 +523,7 @@ TEST_CASE("RepoChooser") = { .repo = "https://upstream.example.com/upstream/repo", .gpgkey = "https://upstream.example.com/upstream/key.gpg" } }; - cloyster::Singleton::get()->disableMirrors = false; + 
cloyster::Singleton::get()->enableMirrors = true; auto choice1 = RepoChooser::choose(mirrorConfigOnline, upstreamConfig); CHECK(choice1 == RepoChooser::Choice::MIRROR); auto choice2 = RepoChooser::choose(mirrorConfigOffline, upstreamConfig); @@ -587,7 +588,7 @@ TEST_CASE("RepoAssembler") == RepoChooser::Choice::MIRROR); // Disable mirrors - cloyster::Singleton::get()->disableMirrors = true; + cloyster::Singleton::get()->enableMirrors = false; // If mirrors are disabled it should choose the upstream even if the // mirror is online @@ -605,7 +606,7 @@ TEST_CASE("RepoAssembler") CHECK(repoUpstream.baseurl().value() == upstreamConfig.baseurl()); // Enable mirrors again - cloyster::Singleton::get()->disableMirrors = false; + cloyster::Singleton::get()->enableMirrors = true; auto repoMirror = RepoAssembler::assemble(repoId, mirrorConfigOnline, upstreamConfig); // CHECK(repoMirror.baseurl().value() == mirrorConfigOnline.baseurl()); @@ -684,7 +685,8 @@ class RepoConfigParser final { fmt::arg("beegfsVersion", vars.beegfsVersion), fmt::arg("zabbixVersion", vars.zabbixVersion), fmt::arg("xcatVersion", vars.xcatVersion), - fmt::arg("ohpcVersion", vars.ohpcVersion)); + fmt::arg("ohpcVersion", vars.ohpcVersion), + fmt::arg("ofedVersion", vars.ofedVersion)); }; public: @@ -807,6 +809,7 @@ class RepoConfigParser final { .releasever = "9", .xcatVersion = "latest", .zabbixVersion = "6.4", + .ofedVersion = "latest-2.9", }) { RepoConfFile conffile; @@ -1337,7 +1340,7 @@ struct RepoGenerator final { TEST_CASE("RepoGenerator") { auto opts = Options { - .disableMirrors = false, + .enableMirrors = true, .mirrorBaseUrl = "https://mirror.example.com", }; const auto osinfo = OS(models::OS::Distro::Rocky, OS::Platform::el9, 5); @@ -1521,8 +1524,9 @@ void RepoManager::initializeDefaultRepositories() return; } LOG_INFO("RepoManager initialization"); - auto osinfo - = cloyster::Singleton::get()->getHeadnode().getOS(); + auto cluster = cloyster::Singleton::get(); + auto osinfo = 
cluster->getHeadnode().getOS(); + auto ofedVersion = cluster->getOFED()->getVersion(); const auto vars = RepoConfigVars { .arch = cloyster::utils::enums::toString(osinfo.getArch()), @@ -1532,7 +1536,9 @@ void RepoManager::initializeDefaultRepositories() .releasever = fmt::format("{}", osinfo.getMajorVersion()), .xcatVersion = opts->xcatVersion, .zabbixVersion = opts->zabbixVersion, + .ofedVersion = ofedVersion, }; + switch (osinfo.getPackageType()) { case OS::PackageType::RPM: { // Generate the repository files @@ -1543,6 +1549,11 @@ void RepoManager::initializeDefaultRepositories() m_impl->rpm.loadBaseDir(); // Enable the repositories m_impl->rpm.enable(repos, true); + + LOG_INFO("Enabling dnf keepcache option, use `dnf config-manager " + "--save --setopt=keepcache=False` to disable it") + runner::shell::cmd("grep -q '^keepcache=' /etc/dnf/dnf.conf || dnf " + "config-manager --save --setopt=keepcache=True"); } break; case OS::PackageType::DEB: throw std::logic_error("DEB packages not implemented"); diff --git a/src/services/runner.cpp b/src/services/runner.cpp index 3ce6df62..1b9b55b0 100644 --- a/src/services/runner.cpp +++ b/src/services/runner.cpp @@ -84,11 +84,10 @@ int runCommand(const std::string& command, bool overrideDryRun) } }; // namespace { -// -namespace cloyster::services::runner { +namespace cloyster::services::runner::shell { -int shell(std::string_view cmd) { return shellfmt("{}", cmd); } +void cmd(std::string_view cmd) { shell::fmt("{}", cmd); } } @@ -153,8 +152,13 @@ int Runner::run(const ScriptBuilder& script) const std::filesystem::path path = fmt::format("/tmp/{}.sh", hash); functions::installFile(path, std::move(content)); executeCommand(fmt::format("chmod +x {}", path)); - executeCommand(path); - return 0; + const auto exitCode = executeCommand(path); + if (exitCode != 0) { + cloyster::functions::abort( + "Script {} failed with exit code {}", path, exitCode); + } + + return exitCode; } CommandProxy Runner::executeCommandIter(const 
std::string& cmd, Stream /*out*/) diff --git a/src/services/shell.cpp b/src/services/shell.cpp index 70980e85..e99e02a1 100644 --- a/src/services/shell.cpp +++ b/src/services/shell.cpp @@ -40,6 +40,44 @@ using cloyster::services::IRunner; namespace { +void dumpPreInstallState() +{ + using namespace cloyster::services::runner; + const auto opts = cloyster::Singleton::get(); + + LOG_INFO("Dumping cluster state before the installation begins") + + LOG_INFO("OS"); + shell::cmd("cat /etc/os-release"); + + LOG_INFO("Repositories URLs"); + shell::cmd( + "grep -EH '^(mirrorlist|baseurl)' /etc/yum.repos.d/*.repo || true"); + + LOG_INFO("Packages installed"); + shell::cmd("rpm -qa"); + + LOG_INFO("Network configuration"); + shell::cmd("ip a"); + shell::cmd("ip link"); + + LOG_INFO("Kernel version"); + shell::cmd("uname -a"); + + LOG_INFO("Memory"); + shell::cmd("free -m"); + + LOG_INFO("Services running"); + shell::cmd("systemctl --no-pager list-units --plain --type=service --all"); + + LOG_INFO("Firewall configuration"); + // firewalld may not be running + shell::cmd("firewall-cmd --list-all-zones || true"); + + LOG_INFO("End of cluster state dump"); + opts->maybeStopAfterStep("dump-cluster-state"); +} + auto getToEnableRepoNames(const OS& osinfo) { switch (osinfo.getPlatform()) { @@ -248,23 +286,24 @@ void Shell::configureNetworks(const std::list& connections) fmt::format("nmcli device set {} managed yes", interface)); ::runner()->executeCommand( fmt::format("nmcli device set {} autoconnect yes", interface)); - ::runner()->executeCommand( - fmt::format("nmcli connection add con-name {} ifname {} type {} " - "mtu {} ipv4.method manual ipv4.address {}/{} " - "ipv4.dns \"{}\" " - // "ipv4.gateway {} ipv4.dns \"{}\" " - "ipv4.dns-search {} ipv6.method disabled", - cloyster::utils::enums::toString( - connection.getNetwork()->getProfile()), - interface, - cloyster::utils::enums::toString( - connection.getNetwork()->getType()), - connection.getMTU(), 
connection.getAddress().to_string(), - connection.getNetwork()->cidr.at( - connection.getNetwork()->getSubnetMask().to_string()), - // connection.getNetwork()->getGateway().to_string(), - fmt::join(formattedNameservers, " "), - connection.getNetwork()->getDomainName())); + ::runner()->executeCommand(fmt::format( + "nmcli connection add con-name {} ifname {} type {} " + "mtu {} ipv4.method manual ipv4.address {}/{} " + "ipv4.dns \"{}\" " + // "ipv4.gateway {} ipv4.dns \"{}\" " + // @FIXME: This will break Confluent, is it required by xCAT? + "ipv4.dns-search {} ipv6.method disabled", + cloyster::utils::enums::toString( + connection.getNetwork()->getProfile()), + interface, + cloyster::utils::enums::toString( + connection.getNetwork()->getType()), + connection.getMTU(), connection.getAddress().to_string(), + connection.getNetwork()->cidr.at( + connection.getNetwork()->getSubnetMask().to_string()), + // connection.getNetwork()->getGateway().to_string(), + fmt::join(formattedNameservers, " "), + connection.getNetwork()->getDomainName())); /* Give network manage some time to settle thing up * Avoids: Error: Connection activation failed: IP configuration could @@ -402,6 +441,10 @@ void Shell::pinOSVersion() */ void Shell::install() { + // Dump the state of the cluster before starting the installation, this + // will output a lot of helpful information in the logs + dumpPreInstallState(); + const auto opts = cloyster::Singleton::get(); const auto osinfo = os(); configureRepositories(); @@ -438,7 +481,6 @@ void Shell::install() "ro,no_subtree_check"); const auto nfsInstallScript = networkFileSystem.installScript(cluster()->getHeadnode().getOS()); - ::runner()->run(nfsInstallScript); opts->maybeStopAfterStep("nfs-setup"); configureQueueSystem(); if (cluster()->getMailSystem().has_value()) { @@ -469,6 +511,9 @@ void Shell::install() LOG_INFO("[{}] Installing provisioner packages", provisionerName) provisioner->installPackages(); + // NFS requires /install and /tftpboot folders 
+ ::runner()->run(nfsInstallScript); + LOG_INFO("[{}] Patching the provisioner", provisionerName) provisioner->patchInstall(); @@ -501,6 +546,10 @@ void Shell::install() provisionerName); provisioner->setNodesBoot(); provisioner->resetNodes(); + + // Fix slurmctld: error: Check for out of sync clocks + LOG_INFO("Synchronizing clocks"); + osservice()->restartService("chronyd"); } } diff --git a/src/services/xcat.cpp b/src/services/xcat.cpp index 818814ca..e6dcb5ac 100644 --- a/src/services/xcat.cpp +++ b/src/services/xcat.cpp @@ -5,6 +5,7 @@ #include #include // setenv / getenv +#include #include #include @@ -82,7 +83,6 @@ XCAT::Image XCAT::getImage() const { return m_stateless; } void XCAT::installPackages() { auto osservice = cloyster::Singleton::get(); - osservice->install("initscripts"); osservice->install("xCAT"); } @@ -106,7 +106,7 @@ void XCAT::patchInstall() "sed -i \"s/-extensions server //g\" " "/opt/xcat/share/xcat/scripts/setup-server-cert.sh"); - cloyster::services::runner::shell( + cloyster::services::runner::shell::cmd( R"del((cd / && patch --forward --batch -p0 <<'EOF' --- opt/xcat/lib/perl/xCAT_plugin/ddns.pm.orig 2025-07-16 09:53:20.546246189 -0300 +++ opt/xcat/lib/perl/xCAT_plugin/ddns.pm 2025-07-16 09:53:36.614512354 -0300 @@ -197,8 +197,33 @@ void XCAT::copycds(const std::filesystem::path& diskImage) const void XCAT::genimage() { - cloyster::Singleton::get()->checkCommand( - fmt::format("genimage {}", m_stateless.osimage)); + using namespace runner; + const auto osinfo + = cloyster::Singleton::get()->getNodes()[0].getOS(); + const auto kernelVersion = osinfo.getKernel(); + const auto osService = cloyster::Singleton::get(); + if (kernelVersion == osService->getKernelRunning()) { + shell::fmt("genimage {} ", m_stateless.osimage); + return; + } + + LOG_INFO("Customizing the kernel image"); + const auto kernelPackages = fmt::format( + // Pay attention to the spaces, they are required + "kernel-{0} " + "kernel-devel-{0} " + "kernel-core-{0} " + 
"kernel-modules-{0} " + "kernel-modules-core-{0}", + kernelVersion); + + shell::fmt("mkdir -p /install/kernels/{}", kernelVersion); + shell::fmt("dnf download {} --destdir /install/kernels/{}", kernelPackages, + kernelVersion); + shell::fmt("createrepo /install/kernels/{}", kernelVersion); + shell::fmt("chdef -t osimage {} -p pkgdir=/install/kernels/{}", + m_stateless.osimage, kernelVersion); + shell::fmt("genimage {} -k {}", m_stateless.osimage, kernelVersion); } void XCAT::packimage() @@ -256,6 +281,8 @@ void XCAT::configureTimeService() void XCAT::configureInfiniband() { + const auto osinfo + = cloyster::Singleton::get()->getNodes()[0].getOS(); LOG_INFO("[xCAT] Configuring infiniband"); if (const auto& ofed = cluster()->getOFED()) { switch (ofed->getKind()) { @@ -267,9 +294,6 @@ void XCAT::configureInfiniband() case OFED::Kind::Mellanox: { auto repoManager = cloyster::Singleton::get(); auto runner = cloyster::Singleton::get(); - auto arch = cloyster::utils::enums::toString( - cluster()->getNodes()[0].getOS().getArch()); - auto osService = cloyster::Singleton::get(); auto opts = cloyster::Singleton::get(); // Add the rpm to the image @@ -278,10 +302,7 @@ void XCAT::configureInfiniband() // The kernel modules are build by the OFED.cpp module, see // OFED.cpp - const auto kernelVersion = opts->dryRun - ? 
"5.14.0-503.33.1.el9_5" - // getKernelInstalled cannot run at dryRun - : osService->getKernelInstalled(); + const auto kernelVersion = osinfo.getKernel(); // Configure Apache to serve the RPM repository const auto repoName = fmt::format("doca-kernel-{}", kernelVersion); @@ -321,7 +342,9 @@ void XCAT::configureInfiniband() void XCAT::configureSLURM() { + // NOTE: hwloc-libs required to fix slurmd m_stateless.otherpkgs.emplace_back("ohpc-slurm-client"); + m_stateless.otherpkgs.emplace_back("hwloc-libs"); // TODO: Deprecate this for SRV entries on DNS: _slurmctld._tcp 0 100 6817 m_stateless.postinstall.emplace_back( @@ -641,7 +664,7 @@ void XCAT::addNodes() // TODO: Create separate functions runner->executeCommand("makehosts"); runner->executeCommand("makedhcp -n"); - runner->executeCommand("makedns -a"); + runner->executeCommand("makedns -n"); runner->executeCommand("makegocons"); setNodesImage(); } diff --git a/test/sample/answerfile/rocky9-base.ini b/test/sample/answerfile/rocky9-base.ini index fcaf3518..8eef4134 100644 --- a/test/sample/answerfile/rocky9-base.ini +++ b/test/sample/answerfile/rocky9-base.ini @@ -16,7 +16,7 @@ domain_name=cluster.example.com # Cloyster must have an external network [network_external] -interface=enp1s0 +interface=enp2s1 #ip_address=192.168.20.25 #subnet_mask=255.255.255.0 #gateway=192.168.122.1 @@ -26,7 +26,7 @@ domain_name=cluster.external.example.com # Cloyster must have an management network [network_management] -interface=enp2s0 +interface=enp2s2 ip_address=192.168.30.254 subnet_mask=255.255.255.0 gateway=192.168.122.1 @@ -47,16 +47,17 @@ nameservers=192.168.122.1 [system] # Full path to the disk image -disk_image=/opt/iso/Rocky-9.5-x86_64-dvd.iso +disk_image=/opt/iso/Rocky-9.6-x86_64-dvd.iso # Supported distros: rhel, ol, rocky distro=rocky -version=9.5 -kernel=5.14.0-427.13.1.el9_4.x86_64 +version=9.6 +# kernel=5.14.0-570.32.1.el9_6.x86_64 +# Comment ofed section to use disable Infiniband [ofed] # kinds: mellanox | inbox 
kind=mellanox -version=latest +version=latest-2.9-LTS # Generic. If a node.XX section does not have one of these options, they are obtained here. # Comment if you don't want to use generic options. In this case, you MUST fulfill all the node.XX options. diff --git a/test/sample/answerfile/rocky9-noib.ini b/test/sample/answerfile/rocky9-noib.ini index 3e82d4fa..7ea32d74 100644 --- a/test/sample/answerfile/rocky9-noib.ini +++ b/test/sample/answerfile/rocky9-noib.ini @@ -51,7 +51,6 @@ disk_image=/opt/iso/Rocky-9.5-x86_64-dvd.iso # Supported distros: rhel, ol, rocky distro=rocky version=9.5 -kernel=5.14.0-427.13.1.el9_4.x86_64 # [ofed] # # kinds: mellanox | inbox