Skip to content
This repository was archived by the owner on Apr 16, 2026. It is now read-only.

Commit 16be86e

Browse files
committed
Fix OFED installation in headnode
1 parent 63b127b commit 16be86e

10 files changed

Lines changed: 151 additions & 34 deletions

File tree

include/cloysterhpc/functions.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,8 @@ void copyFile(std::filesystem::path source, std::filesystem::path destination);
230230
void installFile(const std::filesystem::path& path, std::istream& data);
231231
void installFile(const std::filesystem::path& path, std::string&& data);
232232

233+
bool exists(const std::filesystem::path& path);
234+
233235
} // namespace cloyster
234236

235237
/**

include/cloysterhpc/ofed.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ class OFED {
5252
* specified kind.
5353
*/
5454
void install() const;
55+
56+
/**
57+
* @brief Returns true if OFED is already installed
58+
*/
59+
[[nodiscard]] bool installed() const;
5560
};
5661

5762
#endif // CLOYSTERHPC_OFED_H_

include/cloysterhpc/services/files.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
namespace cloyster::services::files {
1212

13+
constexpr std::size_t CHUNK_SIZE = 16384;
14+
1315
template <typename File>
1416
concept IsKeyFileReadable = requires(
1517
const File& file, const std::string& group, const std::string& key) {
@@ -91,9 +93,8 @@ static_assert(cloyster::concepts::IsSaveable<KeyFile>);
9193
static_assert(concepts::IsMoveable<KeyFile>);
9294
static_assert(!concepts::IsCopyable<KeyFile>);
9395

94-
9596
std::string checksum(const std::string& data);
96-
std::string checksum(const std::filesystem::path& path, const std::size_t chunkSize = 16384);
97+
std::string checksum(const std::filesystem::path& path, const std::size_t chunkSize = CHUNK_SIZE);
9798

9899
};
99100

include/cloysterhpc/services/runner.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ namespace cloyster::services {
1717
class BaseRunner {
1818
public:
1919
virtual int executeCommand(const std::string&) = 0;
20+
virtual void checkCommand(const std::string&) = 0;
21+
virtual std::vector<std::string> checkOutput(const std::string&) = 0;
2022

2123
virtual int downloadFile(const std::string& url, const std::string& file);
2224

@@ -26,20 +28,26 @@ class BaseRunner {
2628
class Runner : public BaseRunner {
2729
public:
2830
int executeCommand(const std::string&) override;
31+
void checkCommand(const std::string&) override;
32+
std::vector<std::string> checkOutput(const std::string&) override;
2933

3034
virtual ~Runner() = default;
3135
};
3236

3337
class DryRunner : public BaseRunner {
3438
public:
3539
int executeCommand(const std::string&) override;
40+
void checkCommand(const std::string&) override;
41+
std::vector<std::string> checkOutput(const std::string&) override;
3642

3743
virtual ~DryRunner() = default;
3844
};
3945

4046
class MockRunner : public BaseRunner {
4147
public:
4248
int executeCommand(const std::string&) override;
49+
void checkCommand(const std::string&) override;
50+
std::vector<std::string> checkOutput(const std::string&) override;
4351

4452
virtual ~MockRunner() = default;
4553

rpmspecs/opencattus.spec

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,5 @@ install -m 755 build/src/cloysterhpc %{buildroot}/usr/bin/cloysterhpc
3636
/usr/bin/cloysterhpc
3737

3838
%changelog
39-
* Tue Feb 25 2025 Daniel Hilst <danielhilst@versatushpc.com> - 1.0-1
39+
* Tue Feb 25 2025 Daniel Hilst <daniel@versatushpc.com.br> - 1.0-1
4040
- Initial release

src/functions.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,4 +380,9 @@ void installFile(const std::filesystem::path& path, std::string&& data)
380380
installFile(path, stringData);
381381
}
382382

383+
/**
 * @brief Reports whether a filesystem entry exists at the given path.
 *
 * Thin forwarder to std::filesystem::exists.
 *
 * @param path Location to probe.
 * @return true when std::filesystem::exists reports the path as existing.
 */
bool exists(const std::filesystem::path& path)
{
    const bool present = std::filesystem::exists(path);
    return present;
}
387+
383388
}; // namespace cloyster

src/main.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,10 +125,13 @@ int main(int argc, const char** argv)
125125
"-u, --unattended", unattended, "Perform an unattended installation");
126126

127127
#ifndef NDEBUG
128-
std::string loadConfFile {};
128+
std::string loadConfFile{};
129129
app.add_option("--test-conf-file", loadConfFile,
130130
"Hook for testing configuration file loading");
131131

132+
std::string testCommand{};
133+
app.add_option("--test-command", testCommand,
134+
"Run a command for testing purposes");
132135
#endif
133136

134137
CLI11_PARSE(app, argc, argv)
@@ -229,6 +232,16 @@ int main(int argc, const char** argv)
229232

230233
initializeSingletons(std::move(model));
231234

235+
#ifndef NDEBUG
236+
if (!testCommand.empty()) {
237+
LOG_INFO("Running test command {}", testCommand);
238+
auto runner = cloyster::Singleton<cloyster::BaseRunner>::get();
239+
runner->checkCommand(testCommand);
240+
return EXIT_SUCCESS;
241+
}
242+
#endif
243+
244+
232245
std::unique_ptr<Execution> executionEngine
233246
= std::make_unique<cloyster::services::Shell>();
234247

src/ofed.cpp

Lines changed: 68 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,22 @@
88
#include <cloysterhpc/ofed.h>
99
#include <utility>
1010

11+
using cloyster::BaseRunner;
1112
using cloyster::runCommand;
1213

1314
namespace {
1415

15-
auto docaRepoTemplate(std::string version, std::string distro)
16+
auto docaRepoTemplate(std::string version, std::string distro, std::string arch)
1617
{
1718
static constexpr std::string_view templ = R"(
1819
[doca]
1920
name=NVIDIA DOCA Repository - RHEL {1}
20-
baseurl=https://linux.mellanox.com/public/repo/doca/{0}/{1}/x86_64/
21+
baseurl=https://linux.mellanox.com/public/repo/doca/{0}/{1}/{2}/
2122
enabled=1
2223
gpgcheck=1
23-
gpgkey=https://linux.mellanox.com/public/repo/doca/{0}/GPG-KEY-Mellanox
24+
gpgkey=https://linux.mellanox.com/public/repo/doca/{0}/{1}/{2}/GPG-KEY-Mellanox.pub
2425
)";
25-
std::istringstream data(fmt::format(templ, version, distro));
26+
std::istringstream data(fmt::format(templ, version, distro, arch));
2627
return data;
2728
}
2829

@@ -58,25 +59,50 @@ void OFED::setKind(Kind kind) { m_kind = kind; }
5859

5960
OFED::Kind OFED::getKind() const { return m_kind; }
6061

62+
bool OFED::installed() const
63+
{
64+
if (cloyster::getEnvironmentVariable("CATTUS_FORCE_INFINIBAND_INSTALL") == "1") {
65+
return false;
66+
}
67+
68+
if (!cloyster::dryRun) {
69+
return false;
70+
}
71+
72+
auto runner = cloyster::Singleton<BaseRunner>::get();
73+
switch (m_kind) {
74+
case OFED::Kind::Mellanox:
75+
return cloyster::exists("/opt/mellanox/doca/tools/doca-kernel-support");
76+
case OFED::Kind::Inbox:
77+
return runner->executeCommand("dnf group info \"Infiniband Support\"") == 0;
78+
case OFED::Kind::Oracle:
79+
throw std::logic_error("Not implemented");
80+
}
81+
82+
std::unreachable();
83+
}
84+
6185
void OFED::install() const
6286
{
87+
// Idempotency check
88+
if (installed()) {
89+
return;
90+
}
91+
6392
switch (m_kind) {
6493
case OFED::Kind::Inbox:
6594
runCommand("dnf -y groupinstall \"Infiniband Support\"");
66-
6795
break;
6896

6997
case OFED::Kind::Mellanox:
7098
{
99+
auto cluster = cloyster::Singleton<cloyster::models::Cluster>::get();
71100
auto runner = cloyster::Singleton<cloyster::services::BaseRunner>::get();
72101
auto repoManager = cloyster::Singleton<cloyster::services::repos::RepoManager>::get();
73102

74-
if (runner->executeCommand("modprobe mlx5_core") == 0) {
75-
LOG_WARN("mlx5_core module loaded, skiping DOCA setup");
76-
return;
77-
}
78-
79-
auto repoData = docaRepoTemplate(getVersion(), headnodeDistroName());
103+
auto repoData = docaRepoTemplate(
104+
getVersion(), headnodeDistroName(),
105+
cloyster::utils::enumToString(cluster->getHeadnode().getOS().getArch()));
80106
std::filesystem::path path = "/etc/yum.repos.d/mlx-doca.repo";
81107

82108
// Install the repository and enable it
@@ -85,21 +111,40 @@ void OFED::install() const
85111
repoManager->enable("doca");
86112

87113
// Install the required packages
88-
runner->executeCommand("dnf makecache");
89-
runner->executeCommand("dnf install –y kernel kernel-devel doca-extra");
114+
runner->checkCommand("dnf makecache");
115+
runner->checkCommand("dnf -y install kernel kernel-devel doca-extra");
116+
117+
LOG_INFO("Compiling OFED DOCA drivers, this may take a while");
118+
// Run the Mellanox script, this generates an RPM at tmp.
119+
//
120+
// Use the kernel-devel version instead of the booted kernel
121+
// version, this is to handle the case where a new kernel is
122+
// installed but no reboot was done yet. After compiling the
123+
// drivers the headnode should be rebooted to reload the new kernel.
124+
// The driver may support weak updates modules and load without
125+
// need for reboot.
126+
if (cloyster::getEnvironmentVariable("CATTUS_SKIP_INFINIBAND_COMPILE_DOCA_DRIVER") != "1") {
127+
runner->checkCommand("bash -c \"/opt/mellanox/doca/tools/doca-kernel-support -k $(rpm -q --qf \"%{VERSION}-%{RELEASE}.%{ARCH}\n\" kernel-devel)\"");
128+
}
90129

91-
// Run the Mellanox script, this generates an RPM at tmp
92-
assert(runner->executeCommand("/opt/mellanox/doca/tools/doca-kernel-support -k $(rpm -q --qf \"%{VERSION}-%{RELEASE}.%{ARCH}\n\" kernel-devel") == 0);
130+
// Get the last rpm in /tmp/DOCA*/ folder
131+
auto rpm = runner->checkOutput("bash -c \"find /tmp/DOCA*/ -name '*.rpm' -printf '%T@ %p\n' | sort -nk1 | tail -1 | awk '{print $2}'\"");
132+
assert(rpm.size() > 0); // at least one line
93133

94134
// Install the (last) generated rpm
95-
runner->executeCommand("rpm -ivh $(find /tmp/DOCA.*/ -name '*.rpm' -printf \"%T@ %p\n\" | sort -nrk1 | tail -1 | awk '{print $2}')");
96-
97-
runner->executeCommand("dnf makecache");
98-
runner->executeCommand("dnf install –y kernel kernel-devel doca-extra");
99-
if (runner->executeCommand("lsmod | grep mlx5_core") != 0) {
100-
runner->executeCommand("modprobe mlx_core");
101-
}
102-
135+
runner->executeCommand(fmt::format("rpm -vih {}", rpm[0]));
136+
137+
runner->checkCommand("dnf makecache");
138+
// @NOTE: Are these packages correct/good default?
139+
runner->checkCommand("dnf install -y \
140+
kmod-mlnx-ofa_kernel \
141+
mlnx-ofa_kernel \
142+
mlnx-tools \
143+
xpmem \
144+
kmod-iser \
145+
kmod-srp \
146+
mlnx-ofa_kernel-devel");
147+
runner->checkCommand("modprobe mlx5_core");
103148
}
104149
break;
105150

src/services/runner.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <cloysterhpc/services/runner.h>
55

66
#include <fmt/format.h>
7+
#include <ranges>
78

89
namespace cloyster::services {
910

@@ -18,18 +19,55 @@ int Runner::executeCommand(const std::string& cmd)
1819
return cloyster::runCommand(cmd);
1920
}
2021

22+
void Runner::checkCommand(const std::string& cmd)
23+
{
24+
if (cloyster::runCommand(cmd) != 0) {
25+
throw std::runtime_error(fmt::format("ERROR: Command failed '{}'", cmd));
26+
}
27+
}
28+
29+
std::vector<std::string> Runner::checkOutput(const std::string& cmd)
30+
{
31+
std::list<std::string> output;
32+
if (cloyster::runCommand(cmd, output) != 0) {
33+
throw std::runtime_error(fmt::format("ERROR: Command failed '{}'", cmd));
34+
}
35+
return output | std::ranges::to<std::vector>();
36+
}
37+
38+
2139
int DryRunner::executeCommand(const std::string& cmd)
2240
{
2341
LOG_WARN("Dry Run: Would execute command: {}", cmd);
2442
return OK;
2543
}
2644

45+
void DryRunner::checkCommand(const std::string& cmd)
46+
{
47+
LOG_WARN("Dry Run: Would execute command: {}", cmd);
48+
}
49+
50+
std::vector<std::string> DryRunner::checkOutput(const std::string& cmd)
51+
{
52+
LOG_WARN("Dry Run: Would check output of command: {}", cmd);
53+
return {};
54+
}
55+
2756
int MockRunner::executeCommand(const std::string& cmd)
2857
{
2958
m_cmds.push_back(cmd);
3059
return OK;
3160
}
3261

62+
void MockRunner::checkCommand(const std::string& cmd)
63+
{
64+
}
65+
66+
std::vector<std::string> MockRunner::checkOutput(const std::string& cmd)
67+
{
68+
return {};
69+
}
70+
3371
const std::vector<std::string>& MockRunner::listCommands() const
3472
{
3573
return m_cmds;

src/services/xcat.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -235,20 +235,20 @@ void XCAT::configureInfiniband()
235235
IndexOptions FancyIndexing VersionSort NameWidth=* HTMLTable Charset=UTF-8
236236
</Directory>
237237
)", repoFolder));
238-
runner->executeCommand("apachectl configtest");
239-
runner->executeCommand("systemctl restart httpd");
238+
runner->checkCommand("apachectl configtest");
239+
runner->checkCommand("systemctl restart httpd");
240240

241241
// Create the RPM repository
242-
runner->executeCommand(
242+
runner->checkCommand(
243243
fmt::format("bash -c \"cp -v /usr/share/doca-host-*/Modules/*.{}/*.rpm {}\"",
244244
arch, repoFolder));
245-
runner->executeCommand(
245+
runner->checkCommand(
246246
fmt::format("createrepo {}", repoFolder));
247247

248248

249-
// Add the repository to the image
250-
runner->executeCommand(
251-
fmt::format("bash -c \"chdef -t osimage {} --plus otherpkgdir=http://$(hostname)/rpmrepo\"",
249+
// Add the repository to the stateless image
250+
runner->checkCommand(
251+
fmt::format("bash -c \"chdef -t osimage {} --plus otherpkgdir=http://localhost/rpmrepo/\"",
252252
m_stateless.osimage));
253253
}
254254
break;

0 commit comments

Comments
 (0)