Skip to content
This repository was archived by the owner on Apr 16, 2026. It is now read-only.

Commit 579d8c5

Browse files
committed
HOTFIX some bugs found during the installation
- Fix unhandled exit code bug - Get kernel version from answerfile - Fix OFED version in repos.conf - Fix image generation with custom kernel - Fix spack idempotency - Fix diskImage use before set - Run clang-format - Enable keepcache in dnf - Fix from where the kernel version is fetched - Use running kernel when kernel option is omitted in the answerfile - Fix slurmd error in the computing node - Comment kernel in rocky9-base.ini - Fix makedns command & restart chronyd
1 parent 56a9e89 commit 579d8c5

File tree

17 files changed

+317
-105
lines changed

17 files changed

+317
-105
lines changed

include/cloysterhpc/services/options.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ struct Options final {
2121
bool runAsDaemon;
2222
bool airGap;
2323
bool unattended;
24-
bool disableMirrors;
24+
bool enableMirrors;
2525
std::size_t logLevelInput;
2626
std::string error;
2727
std::string config;

include/cloysterhpc/services/runner.h

Lines changed: 85 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <boost/process.hpp>
1010
#include <fmt/format.h>
1111

12+
#include <stdexcept>
1213
#include <string>
1314
#include <vector>
1415

@@ -17,35 +18,96 @@
1718
#include <cloysterhpc/services/options.h>
1819
#include <cloysterhpc/services/scriptbuilder.h>
1920

20-
namespace cloyster::services::runner {
21+
namespace cloyster::services::runner::shell {
22+
23+
namespace unsafe {
24+
template <typename... Args>
25+
[[nodiscard]]
26+
int fmt(std::vector<std::string>& output,
27+
fmt::format_string<Args...> format, Args&&... args)
28+
{
29+
auto command = fmt::format(format, std::forward<Args>(args)...);
30+
31+
auto opts = cloyster::Singleton<cloyster::services::Options>::get();
32+
if (!opts->dryRun) {
33+
LOG_DEBUG("Running shell command: {}", command);
34+
boost::process::ipstream pipe_stream;
35+
boost::process::child child("/bin/bash", "-c", command,
36+
boost::process::std_out > pipe_stream);
37+
38+
std::string line;
39+
while (pipe_stream && std::getline(pipe_stream, line)) {
40+
output.emplace_back(line);
41+
LOG_TRACE("{}", line);
42+
}
43+
44+
child.wait();
45+
LOG_DEBUG("Exit code: {}", child.exit_code());
46+
return child.exit_code();
47+
} else {
48+
LOG_INFO("Dry Run: {}", command);
49+
return 0;
50+
}
51+
}
2152

22-
template <typename... Args>
23-
int shellfmt(fmt::format_string<Args...> fmt, Args&&... args)
24-
{
25-
auto command = fmt::format(fmt, std::forward<Args>(args)...);
26-
27-
auto opts = cloyster::Singleton<cloyster::services::Options>::get();
28-
if (!opts->dryRun) {
29-
LOG_DEBUG("Running shell command: {}", command);
30-
boost::process::ipstream pipe_stream;
31-
boost::process::child child(
32-
"/bin/bash", "-c", command, boost::process::std_out > pipe_stream);
33-
34-
std::string line;
35-
while (pipe_stream && std::getline(pipe_stream, line)) {
36-
LOG_TRACE("{}", line);
53+
template <typename... Args>
54+
[[nodiscard]]
55+
int fmt(fmt::format_string<Args...> format, Args&&... args)
56+
{
57+
auto command = fmt::format(format, std::forward<Args>(args)...);
58+
59+
auto opts = cloyster::Singleton<cloyster::services::Options>::get();
60+
if (!opts->dryRun) {
61+
LOG_DEBUG("Running shell command: {}", command);
62+
boost::process::ipstream pipe_stream;
63+
boost::process::child child("/bin/bash", "-c", command,
64+
boost::process::std_out > pipe_stream);
65+
66+
std::string line;
67+
while (pipe_stream && std::getline(pipe_stream, line)) {
68+
LOG_TRACE("{}", line);
69+
}
70+
71+
child.wait();
72+
LOG_DEBUG("Exit code: {}", child.exit_code());
73+
return child.exit_code();
74+
} else {
75+
LOG_INFO("Dry Run: {}", command);
76+
return 0;
3777
}
78+
}
79+
}
3880

39-
child.wait();
40-
LOG_DEBUG("Exit code: {}", child.exit_code());
41-
return child.exit_code();
42-
} else {
43-
LOG_INFO("Dry Run: {}", command);
44-
return 0;
81+
template <typename... Args>
82+
void fmt(fmt::format_string<Args...> format, Args&&... args)
83+
{
84+
const std::string command
85+
= fmt::format(format, std::forward<Args>(args)...);
86+
const auto exitCode = unsafe::fmt("{}", command);
87+
if (exitCode != 0) {
88+
throw std::runtime_error(fmt::format(
89+
"Command {} failed with exit code {}", command, exitCode));
4590
}
4691
}
4792

48-
int shell(std::string_view cmd);
93+
void cmd(std::string_view cmd);
94+
95+
template <typename... Args>
96+
[[nodiscard]]
97+
std::string output(fmt::format_string<Args...> format, Args&&... args)
98+
{
99+
std::vector<std::string> output;
100+
// Construct a command here because it will be used twice and
101+
// we can't use args twice without hurting the perfect forwarding
102+
const std::string command
103+
= fmt::format(format, std::forward<Args>(args)...);
104+
const auto exitCode = unsafe::fmt(output, "{}", command);
105+
if (exitCode != 0) {
106+
throw std::runtime_error(fmt::format(
107+
"Command {} failed with exit code {}", command, exitCode));
108+
}
109+
return fmt::format("{}", fmt::join(output, "\n"));
110+
}
49111

50112
}
51113

repos/repos.conf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,9 @@ filename=xcat-deps.repo
122122
upstream.repo=http://xcat.org/files/xcat/repos/yum/devel/xcat-dep/rh{releasever}/{arch}/
123123
upstream.gpgkey=http://xcat.org/files/xcat/repos/yum/devel/xcat-dep/rh{releasever}/{arch}/repodata/repomd.xml.key
124124

125+
# ofedVersion is configured in the answerfile at ofed.version
125126
[doca]
126127
name=NVIDIA DOCA Repository - RHEL rhel{osversion}
127128
filename=mlx-doca.repo
128-
upstream.repo=https://linux.mellanox.com/public/repo/doca/latest/rhel{osversion}/{arch}/
129+
upstream.repo=https://linux.mellanox.com/public/repo/doca/{ofedVersion}/rhel{osversion}/{arch}/
129130
upstream.gpgkey=https://linux.mellanox.com/public/repo/doca/latest/rhel{osversion}/{arch}/GPG-KEY-Mellanox.pub

rpmspecs/opencattus.spec

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Name: opencattus-installer
22
Version: 1.0
3-
Release: 2
3+
Release: 4
44
Summary: OpenCATTUS Installer
55
License: Apache 2.0
66
URL: https://versatushpc.com.br/opencattus/
@@ -49,6 +49,10 @@ install -m 644 repos/rocky-vault.conf %{buildroot}/opt/cloysterhpc/conf/repos/ro
4949
/opt/cloysterhpc/conf/repos/rocky-vault.conf
5050

5151
%changelog
52+
* Thu Aug 14 2025 Daniel Hilst <daniel@versatushpc.com.br> - 1.0-4 - Bugfix
53+
- Update OFED
54+
- Dump configuration
55+
- Add support for Rocky Linux 9.6
5256
* Wed Jul 16 2025 Daniel Hilst <daniel@versatushpc.com.br> - 1.0-3 - Add ansible roles
5357
- Add ansible roles implementation
5458
- Fix dnssec configuration generation in xCAT plugin

src/diskImage.cpp

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -38,30 +38,43 @@ void DiskImage::setPath(const std::filesystem::path& path)
3838

3939
bool DiskImage::isKnownImage(const std::filesystem::path& path)
4040
{
41+
constexpr auto chooseDistro = [](std::string_view imageView)
42+
-> std::optional<cloyster::models::OS::Distro> {
43+
if (imageView.starts_with("Rocky")) {
44+
return cloyster::models::OS::Distro::Rocky;
45+
} else if (imageView.starts_with("rhel")) {
46+
return cloyster::models::OS::Distro::RHEL;
47+
} else if (imageView.starts_with("OracleLinux")) {
48+
return cloyster::models::OS::Distro::OL;
49+
} else if (imageView.starts_with("AlmaLinux")) {
50+
return cloyster::models::OS::Distro::AlmaLinux;
51+
} else {
52+
return std::nullopt;
53+
}
54+
};
55+
4156
for (const auto& image : m_knownImageFilename) {
4257
if (path.filename().string() == image) {
4358
LOG_TRACE("Disk image is recognized")
4459

4560
auto imageView = std::string_view(image);
46-
if (imageView.starts_with("Rocky")) {
47-
m_distro = cloyster::models::OS::Distro::Rocky;
48-
} else if (imageView.starts_with("rhel")) {
49-
m_distro = cloyster::models::OS::Distro::RHEL;
50-
} else if (imageView.starts_with("OracleLinux")) {
51-
m_distro = cloyster::models::OS::Distro::OL;
52-
} else if (imageView.starts_with("AlmaLinux")) {
53-
m_distro = cloyster::models::OS::Distro::AlmaLinux;
54-
} else {
55-
throw std::logic_error(fmt::format(
56-
"Can't determine the distro for the image {}", image));
61+
const auto distro = chooseDistro(imageView);
62+
if (distro) {
63+
m_distro = distro;
64+
return true;
5765
}
58-
59-
return true;
6066
}
6167
}
6268

63-
LOG_TRACE("Disk image is unknown. Maybe you're using a custom image or "
64-
"changed the default name?");
69+
const auto distro
70+
= chooseDistro(std::string_view(path.filename().string()));
71+
if (distro) {
72+
m_distro = distro;
73+
return true;
74+
}
75+
cloyster::functions::abort(
76+
"Disk image is unknown. Maybe you're using a custom image or "
77+
"changed the default name?");
6578
return false;
6679
}
6780

src/models/answerfile.cpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@
77
#include <boost/algorithm/string/join.hpp>
88
#include <boost/algorithm/string/split.hpp>
99
#include <boost/lexical_cast.hpp>
10+
#include <chrono>
1011
#include <cloysterhpc/functions.h>
1112
#include <cloysterhpc/models/answerfile.h>
1213
#include <cloysterhpc/services/log.h>
1314
#include <cloysterhpc/services/options.h>
15+
#include <cloysterhpc/services/osservice.h>
1416
#include <cstddef>
1517
#include <fmt/core.h>
1618
#include <iterator>
@@ -426,7 +428,24 @@ void AnswerFile::loadSystemSettings()
426428
}
427429

428430
system.version = m_keyfile.getString("system", "version");
429-
system.kernel = m_keyfile.getString("system", "kernel");
431+
const auto kernel = m_keyfile.getStringOpt("system", "kernel");
432+
if (kernel) {
433+
system.kernel = kernel.value();
434+
LOG_INFO("Kernel override in the answerfile {}", system.kernel);
435+
} else {
436+
const auto latestKernelVersion = services::runner::shell::output(
437+
// This runs very early so it stops loading all repositories caches,
438+
// which is unnecessary, so I pinned --repo=appstream here
439+
"dnf list --repo=appstream kernel-devel --available "
440+
"--showduplicates | sed 1d | "
441+
// captures the kernel arch, e.g. x86_64, in $1
442+
// this is important later when used to access /lib/modules/...
443+
// folders
444+
R"(perl -lane '$F[0] =~ s/kernel-devel\.(.*)$//; printf "%s.%s\n", $F[1], $1' | tail -1)");
445+
system.kernel = latestKernelVersion;
446+
LOG_INFO("Kernel omitted in the answerfile, using running kernel {}",
447+
system.kernel);
448+
}
430449
}
431450

432451
AFNode AnswerFile::loadNode(const std::string& section)

src/ofed.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

6+
#include <fmt/core.h>
7+
68
#include <cloysterhpc/cloyster.h>
79
#include <cloysterhpc/functions.h>
810
#include <cloysterhpc/ofed.h>
@@ -47,6 +49,8 @@ bool OFED::installed() const
4749
void OFED::install() const
4850
{
4951
const auto opts = cloyster::Singleton<cloyster::services::Options>::get();
52+
const auto cluster = cloyster::Singleton<cloyster::models::Cluster>::get();
53+
const auto osinfo = cluster->getNodes()[0].getOS();
5054

5155
if (opts->dryRun) {
5256
LOG_WARN("Dry-Run: Skiping OFED installation");
@@ -73,14 +77,16 @@ void OFED::install() const
7377
cloyster::services::repos::RepoManager>::get();
7478
auto osService
7579
= cloyster::Singleton<cloyster::services::IOSService>::get();
80+
const std::string kernelVersion = std::string(osinfo.getKernel());
7681
repoManager->enable("doca");
7782
// Install the required packages
7883
runner->checkCommand("dnf makecache --repo=doca");
7984
runner->checkCommand(
80-
"dnf -y install kernel kernel-devel doca-extra");
85+
fmt::format("dnf -y install kernel-{kernelVersion} "
86+
"kernel-devel-{kernelVersion} doca-extra",
87+
fmt::arg("kernelVersion", kernelVersion)));
8188

82-
if (osService->getKernelRunning()
83-
!= osService->getKernelInstalled()) {
89+
if (osService->getKernelRunning() != kernelVersion) {
8490
LOG_WARN("New kernel installed! Rebooting after the "
8591
"installation finishes is advised!");
8692
}
@@ -96,10 +102,9 @@ void OFED::install() const
96102
// The driver may support weak updates modules and load without
97103
// need for reboot.
98104
if (!opts->shouldSkip("compile-doca-driver")) {
99-
runner->checkCommand(
100-
"bash -c \"/opt/mellanox/doca/tools/doca-kernel-support -k "
101-
"$(rpm -q --qf \"%{VERSION}-%{RELEASE}.%{ARCH}\n\" "
102-
"kernel-devel)\"");
105+
runner->checkCommand(fmt::format(
106+
"/opt/mellanox/doca/tools/doca-kernel-support -k {}",
107+
kernelVersion));
103108
}
104109

105110
// Get the last rpm in /tmp/DOCA*/ folder
@@ -123,4 +128,4 @@ void OFED::install() const
123128

124129
break;
125130
}
126-
}
131+
}

src/services/ansible/roles/base.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,18 @@ ScriptBuilder installScript(
5858

5959
builder.addNewLine().addCommand("# Install general base packages");
6060

61-
std::set<std::string> allPackages = { "wget", "curl", "dnf-plugins-core",
62-
"chkconfig", "jq", "tar", "python3-dnf-plugin-versionlock" };
61+
// "python3-dnf-plugin-versionlock" is conflicting with dnf-plugins-core
62+
// during the first install
63+
std::set<std::string> allPackages = {
64+
"wget",
65+
"curl",
66+
"dnf-plugins-core",
67+
"chkconfig",
68+
"initscripts", // @FIXME: This is only required if the provisioner is
69+
// xCAT
70+
"jq",
71+
"tar",
72+
};
6373
if (const auto iter = role.m_vars.find("base_packages");
6474
iter != role.m_vars.end()) {
6575
for (const auto& pkg :

src/services/ansible/roles/spack.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ ScriptBuilder installScript(
2525
fmt::format("Expected spack role, found {}", role.m_roleName));
2626

2727
builder.addNewLine()
28+
.addCommand("# Exit early if spack is already installed")
29+
.addCommand("test -d /opt/spack/.git && exit 0")
2830
.addCommand("# Install dependencies for Spack")
2931
.addPackage("git")
3032
.addNewLine()
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
Base (open)
2+
1. confluent
3+
2. RHEL10 sup
4+
5+
Features (closed)
6+
1. ood
7+
2. grafana
8+
3. slurm
9+
4. integração ood slurm
10+
5. lustre

0 commit comments

Comments
 (0)