Skip to content
This repository was archived by the owner on Apr 16, 2026. It is now read-only.

Commit 8fd37ff

Browse files
committed
Fix OFED installation in headnode
1 parent 63b127b commit 8fd37ff

9 files changed

Lines changed: 91 additions & 25 deletions

File tree

include/cloysterhpc/functions.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,8 @@ void copyFile(std::filesystem::path source, std::filesystem::path destination);
230230
void installFile(const std::filesystem::path& path, std::istream& data);
231231
void installFile(const std::filesystem::path& path, std::string&& data);
232232

233+
bool exists(const std::filesystem::path& path);
234+
233235
} // namespace cloyster
234236

235237
/**

include/cloysterhpc/ofed.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ class OFED {
5252
* specified kind.
5353
*/
5454
void install() const;
55+
56+
/**
57+
* @brief Returns true if OFED is already installed
58+
*/
59+
[[nodiscard]] bool installed() const;
5560
};
5661

5762
#endif // CLOYSTERHPC_OFED_H_

include/cloysterhpc/services/files.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
namespace cloyster::services::files {
1212

13+
/**
 * @brief Default chunk size used when computing the checksum of a file.
 *
 * Used as the default value of the `chunkSize` parameter of
 * checksum(const std::filesystem::path&, std::size_t).
 * NOTE(review): presumably expressed in bytes -- confirm against the
 * checksum implementation.
 *
 * Declared `inline` so every translation unit including this header refers
 * to the same entity instead of getting its own internal-linkage copy.
 */
inline constexpr std::size_t CHUNK_SIZE = 16384;
14+
1315
template <typename File>
1416
concept IsKeyFileReadable = requires(
1517
const File& file, const std::string& group, const std::string& key) {
@@ -91,9 +93,8 @@ static_assert(cloyster::concepts::IsSaveable<KeyFile>);
9193
static_assert(concepts::IsMoveable<KeyFile>);
9294
static_assert(!concepts::IsCopyable<KeyFile>);
9395

94-
9596
std::string checksum(const std::string& data);
96-
std::string checksum(const std::filesystem::path& path, const std::size_t chunkSize = 16384);
97+
std::string checksum(const std::filesystem::path& path, const std::size_t chunkSize = CHUNK_SIZE);
9798

9899
};
99100

include/cloysterhpc/services/runner.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ namespace cloyster::services {
1717
class BaseRunner {
1818
public:
1919
virtual int executeCommand(const std::string&) = 0;
20+
virtual void checkCommand(const std::string&) = 0;
2021

2122
virtual int downloadFile(const std::string& url, const std::string& file);
2223

@@ -26,20 +27,23 @@ class BaseRunner {
2627
class Runner : public BaseRunner {
2728
public:
2829
int executeCommand(const std::string&) override;
30+
void checkCommand(const std::string&) override;
2931

3032
virtual ~Runner() = default;
3133
};
3234

3335
class DryRunner : public BaseRunner {
3436
public:
3537
int executeCommand(const std::string&) override;
38+
void checkCommand(const std::string&) override;
3639

3740
virtual ~DryRunner() = default;
3841
};
3942

4043
class MockRunner : public BaseRunner {
4144
public:
4245
int executeCommand(const std::string&) override;
46+
void checkCommand(const std::string&) override;
4347

4448
virtual ~MockRunner() = default;
4549

rpmspecs/opencattus.spec

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,5 @@ install -m 755 build/src/cloysterhpc %{buildroot}/usr/bin/cloysterhpc
3636
/usr/bin/cloysterhpc
3737

3838
%changelog
39-
* Tue Feb 25 2025 Daniel Hilst <danielhilst@versatushpc.com> - 1.0-1
39+
* Tue Feb 25 2025 Daniel Hilst <daniel@versatushpc.com.br> - 1.0-1
4040
- Initial release

src/functions.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,4 +380,9 @@ void installFile(const std::filesystem::path& path, std::string&& data)
380380
installFile(path, stringData);
381381
}
382382

383+
/**
 * @brief Tell whether a filesystem entry exists at the given path.
 *
 * Thin wrapper over std::filesystem::exists (throwing overload).
 *
 * @param path Path to probe.
 * @return true when std::filesystem::exists reports the path as existing.
 */
bool exists(const std::filesystem::path& path)
{
    const bool present = std::filesystem::exists(path);
    return present;
}
387+
383388
}; // namespace cloyster

src/ofed.cpp

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,22 @@
88
#include <cloysterhpc/ofed.h>
99
#include <utility>
1010

11+
using cloyster::BaseRunner;
1112
using cloyster::runCommand;
1213

1314
namespace {
1415

15-
auto docaRepoTemplate(std::string version, std::string distro)
16+
auto docaRepoTemplate(std::string version, std::string distro, std::string arch)
1617
{
1718
static constexpr std::string_view templ = R"(
1819
[doca]
1920
name=NVIDIA DOCA Repository - RHEL {1}
20-
baseurl=https://linux.mellanox.com/public/repo/doca/{0}/{1}/x86_64/
21+
baseurl=https://linux.mellanox.com/public/repo/doca/{0}/{1}/{2}/
2122
enabled=1
2223
gpgcheck=1
23-
gpgkey=https://linux.mellanox.com/public/repo/doca/{0}/GPG-KEY-Mellanox
24+
gpgkey=https://linux.mellanox.com/public/repo/doca/{0}/{1}/{2}/GPG-KEY-Mellanox.pub
2425
)";
25-
std::istringstream data(fmt::format(templ, version, distro));
26+
std::istringstream data(fmt::format(templ, version, distro, arch));
2627
return data;
2728
}
2829

@@ -58,25 +59,50 @@ void OFED::setKind(Kind kind) { m_kind = kind; }
5859

5960
OFED::Kind OFED::getKind() const { return m_kind; }
6061

62+
bool OFED::installed() const
63+
{
64+
if (cloyster::getEnvironmentVariable("CATTUS_FORCE_INFINIBAND_INSTALL") == "1") {
65+
return false;
66+
}
67+
68+
if (!cloyster::dryRun) {
69+
return false;
70+
}
71+
72+
auto runner = cloyster::Singleton<BaseRunner>::get();
73+
switch (m_kind) {
74+
case OFED::Kind::Mellanox:
75+
return cloyster::exists("/opt/mellanox/doca/tools/doca-kernel-support");
76+
case OFED::Kind::Inbox:
77+
return runner->executeCommand("dnf group info \"Infiniband Support\"") == 0;
78+
case OFED::Kind::Oracle:
79+
throw std::logic_error("Not implemented");
80+
}
81+
82+
std::unreachable();
83+
}
84+
6185
void OFED::install() const
6286
{
87+
// Idempotency check
88+
if (installed()) {
89+
return;
90+
}
91+
6392
switch (m_kind) {
6493
case OFED::Kind::Inbox:
6594
runCommand("dnf -y groupinstall \"Infiniband Support\"");
66-
6795
break;
6896

6997
case OFED::Kind::Mellanox:
7098
{
99+
auto cluster = cloyster::Singleton<cloyster::models::Cluster>::get();
71100
auto runner = cloyster::Singleton<cloyster::services::BaseRunner>::get();
72101
auto repoManager = cloyster::Singleton<cloyster::services::repos::RepoManager>::get();
73102

74-
if (runner->executeCommand("modprobe mlx5_core") == 0) {
75-
LOG_WARN("mlx5_core module loaded, skiping DOCA setup");
76-
return;
77-
}
78-
79-
auto repoData = docaRepoTemplate(getVersion(), headnodeDistroName());
103+
auto repoData = docaRepoTemplate(
104+
getVersion(), headnodeDistroName(),
105+
cloyster::utils::enumToString(cluster->getHeadnode().getOS().getArch()));
80106
std::filesystem::path path = "/etc/yum.repos.d/mlx-doca.repo";
81107

82108
// Install the repository and enable it
@@ -85,21 +111,23 @@ void OFED::install() const
85111
repoManager->enable("doca");
86112

87113
// Install the required packages
88-
runner->executeCommand("dnf makecache");
89-
runner->executeCommand("dnf install –y kernel kernel-devel doca-extra");
114+
runner->checkCommand("dnf makecache");
115+
runner->checkCommand("dnf -y install kernel kernel-devel doca-extra");
90116

91-
// Run the Mellanox script, this generates an RPM at tmp
92-
assert(runner->executeCommand("/opt/mellanox/doca/tools/doca-kernel-support -k $(rpm -q --qf \"%{VERSION}-%{RELEASE}.%{ARCH}\n\" kernel-devel") == 0);
117+
LOG_INFO("Compiling OFED DOCA drivers, this may take a while");
118+
// Run the Mellanox script, this generates an RPM at tmp.
119+
//
120+
// Use the kernel-devel version instead of the booted kernel
121+
// version, this is to handle the case where a new kernel is
122+
// installed but no reboot was done yet
123+
runner->checkCommand("bash -c \"/opt/mellanox/doca/tools/doca-kernel-support -k $(rpm -q --qf \"%{VERSION}-%{RELEASE}.%{ARCH}\n\" kernel-devel)\"");
93124

94125
// Install the (last) generated rpm
95-
runner->executeCommand("rpm -ivh $(find /tmp/DOCA.*/ -name '*.rpm' -printf \"%T@ %p\n\" | sort -nrk1 | tail -1 | awk '{print $2}')");
96-
97-
runner->executeCommand("dnf makecache");
98-
runner->executeCommand("dnf install –y kernel kernel-devel doca-extra");
99-
if (runner->executeCommand("lsmod | grep mlx5_core") != 0) {
100-
runner->executeCommand("modprobe mlx_core");
101-
}
126+
runner->checkCommand("bash -c $'rpm -vih $(find /tmp/DOCA*/ -name \'*.rpm\' -printf \'%T@ %p\n\' | sort -nk1 | tail -1 | awk \'{print $2}\')");
102127

128+
runner->checkCommand("dnf makecache");
129+
runner->checkCommand("dnf -y install kernel kernel-devel doca-extra");
130+
runner->checkCommand("modprobe mlx_core");
103131
}
104132
break;
105133

src/services/runner.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,35 @@ int Runner::executeCommand(const std::string& cmd)
1818
return cloyster::runCommand(cmd);
1919
}
2020

21+
void Runner::checkCommand(const std::string& cmd)
22+
{
23+
if (cloyster::runCommand(cmd) != 0) {
24+
throw std::runtime_error(fmt::format("ERROR: Command failed '{}'", cmd));
25+
}
26+
}
27+
28+
2129
int DryRunner::executeCommand(const std::string& cmd)
2230
{
2331
LOG_WARN("Dry Run: Would execute command: {}", cmd);
2432
return OK;
2533
}
2634

35+
void DryRunner::checkCommand(const std::string& cmd)
36+
{
37+
LOG_WARN("Dry Run: Would execute command: {}", cmd);
38+
}
39+
2740
int MockRunner::executeCommand(const std::string& cmd)
2841
{
2942
m_cmds.push_back(cmd);
3043
return OK;
3144
}
3245

46+
void MockRunner::checkCommand(const std::string& cmd)
47+
{
48+
}
49+
3350
const std::vector<std::string>& MockRunner::listCommands() const
3451
{
3552
return m_cmds;

src/services/shell.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,10 @@ void Shell::install()
454454
installOpenHPCBase();
455455
configureInfiniband();
456456

457+
LOG_INFO("INIFINIBAND SETUP FINISHED");
458+
// @FIXME: Remove this
459+
std::exit(0);
460+
457461
// BUG: Broken. Compute nodes do not mount anything.
458462
NFS networkFileSystem = NFS(systemdBus, "pub", "/opt/ohpc",
459463
cluster()->getHeadnode()

0 commit comments

Comments
 (0)