88#include < cloysterhpc/ofed.h>
99#include < utility>
1010
11+ using cloyster::BaseRunner;
1112using cloyster::runCommand;
1213
1314namespace {
1415
15- auto docaRepoTemplate (std::string version, std::string distro)
16+ auto docaRepoTemplate (std::string version, std::string distro, std::string arch )
1617{
1718 static constexpr std::string_view templ = R"(
1819[doca]
1920name=NVIDIA DOCA Repository - RHEL {1}
20- baseurl=https://linux.mellanox.com/public/repo/doca/{0}/{1}/x86_64 /
21+ baseurl=https://linux.mellanox.com/public/repo/doca/{0}/{1}/{2} /
2122enabled=1
2223gpgcheck=1
23- gpgkey=https://linux.mellanox.com/public/repo/doca/{0}/GPG-KEY-Mellanox
24+ gpgkey=https://linux.mellanox.com/public/repo/doca/{0}/{1}/{2}/ GPG-KEY-Mellanox.pub
2425)" ;
25- std::istringstream data (fmt::format (templ, version, distro));
26+ std::istringstream data (fmt::format (templ, version, distro, arch ));
2627 return data;
2728}
2829
@@ -58,25 +59,51 @@ void OFED::setKind(Kind kind) { m_kind = kind; }
5859
5960OFED::Kind OFED::getKind () const { return m_kind; }
6061
62+ bool OFED::installed () const
63+ {
64+ if (cloyster::getEnvironmentVariable (" CATTUS_FORCE_INFINIBAND_INSTALL" ) == " 1" ) {
65+ return false ;
66+ }
67+
68+ // Return false so the installation runs on dry run
69+ if (cloyster::dryRun) {
70+ return false ;
71+ }
72+
73+ auto runner = cloyster::Singleton<BaseRunner>::get ();
74+ switch (m_kind) {
75+ case OFED::Kind::Mellanox:
76+ return cloyster::exists (" /opt/mellanox/doca/tools/doca-kernel-support" );
77+ case OFED::Kind::Inbox:
78+ return runner->executeCommand (" dnf group info \" Infiniband Support\" " ) == 0 ;
79+ case OFED::Kind::Oracle:
80+ throw std::logic_error (" Not implemented" );
81+ }
82+
83+ std::unreachable ();
84+ }
85+
6186void OFED::install () const
6287{
88+ // Idempotency check
89+ if (installed ()) {
90+ return ;
91+ }
92+
6393 switch (m_kind) {
6494 case OFED::Kind::Inbox:
6595 runCommand (" dnf -y groupinstall \" Infiniband Support\" " );
66-
6796 break ;
6897
6998 case OFED::Kind::Mellanox:
7099 {
100+ auto cluster = cloyster::Singleton<cloyster::models::Cluster>::get ();
71101 auto runner = cloyster::Singleton<cloyster::services::BaseRunner>::get ();
72102 auto repoManager = cloyster::Singleton<cloyster::services::repos::RepoManager>::get ();
73103
74- if (runner->executeCommand (" modprobe mlx5_core" ) == 0 ) {
75- LOG_WARN (" mlx5_core module loaded, skiping DOCA setup" );
76- return ;
77- }
78-
79- auto repoData = docaRepoTemplate (getVersion (), headnodeDistroName ());
104+ auto repoData = docaRepoTemplate (
105+ getVersion (), headnodeDistroName (),
106+ cloyster::utils::enumToString (cluster->getHeadnode ().getOS ().getArch ()));
80107 std::filesystem::path path = " /etc/yum.repos.d/mlx-doca.repo" ;
81108
82109 // Install the repository and enable it
@@ -85,21 +112,33 @@ void OFED::install() const
85112 repoManager->enable (" doca" );
86113
87114 // Install the required packages
88- runner->executeCommand (" dnf makecache" );
89- runner->executeCommand (" dnf install –y kernel kernel-devel doca-extra" );
115+ runner->checkCommand (" dnf makecache" );
116+ runner->checkCommand (" dnf -y install kernel kernel-devel doca-extra" );
117+
118+ LOG_INFO (" Compiling OFED DOCA drivers, this may take a while" );
119+ // Run the Mellanox script; this generates an RPM under /tmp/DOCA*/.
120+ //
121+ // Use the kernel-devel version instead of the booted kernel
122+ // version to handle the case where a new kernel was installed
123+ // but the machine has not been rebooted yet. After compiling the
124+ // drivers the headnode should be rebooted to load the new kernel.
125+ // The driver may support weak-updates modules and load without
126+ // the need for a reboot.
127+ if (cloyster::getEnvironmentVariable (" CATTUS_SKIP_INFINIBAND_COMPILE_DOCA_DRIVER" ) != " 1" ) {
128+ runner->checkCommand (" bash -c \" /opt/mellanox/doca/tools/doca-kernel-support -k $(rpm -q --qf \" %{VERSION}-%{RELEASE}.%{ARCH}\n\" kernel-devel)\" " );
129+ }
90130
91- // Run the Mellanox script, this generates an RPM at tmp
92- assert (runner->executeCommand (" /opt/mellanox/doca/tools/doca-kernel-support -k $(rpm -q --qf \" %{VERSION}-%{RELEASE}.%{ARCH}\n\" kernel-devel" ) == 0 );
131+ // Get the last rpm in /tmp/DOCA*/ folder
132+ auto rpm = runner->checkOutput (" bash -c \" find /tmp/DOCA*/ -name '*.rpm' -printf '%T@ %p\n ' | sort -nk1 | tail -1 | awk '{print $2}'\" " );
133+ assert (rpm.size () > 0 ); // at last one line
93134
94135 // Install the (last) generated rpm
95- runner->executeCommand (" rpm -ivh $(find /tmp/DOCA.*/ -name '*.rpm' -printf \" %T@ %p\n\" | sort -nrk1 | tail -1 | awk '{print $2}')" );
96-
97- runner->executeCommand (" dnf makecache" );
98- runner->executeCommand (" dnf install –y kernel kernel-devel doca-extra" );
99- if (runner->executeCommand (" lsmod | grep mlx5_core" ) != 0 ) {
100- runner->executeCommand (" modprobe mlx_core" );
101- }
136+ runner->executeCommand (fmt::format (" rpm -vih {}" , rpm[0 ]));
102137
138+ runner->checkCommand (" dnf makecache" );
139+ // @NOTE: Are these packages correct/good default?
140+ runner->checkCommand (" dnf install -y doca-ofed mlnx-fw-updater" );
141+ runner->checkCommand (" modprobe mlx5_core" );
103142 }
104143 break ;
105144
0 commit comments