88#include < cloysterhpc/ofed.h>
99#include < utility>
1010
11+ using cloyster::BaseRunner;
1112using cloyster::runCommand;
1213
1314namespace {
1415
15- auto docaRepoTemplate (std::string version, std::string distro)
16+ auto docaRepoTemplate (std::string version, std::string distro, std::string arch )
1617{
1718 static constexpr std::string_view templ = R"(
1819[doca]
1920name=NVIDIA DOCA Repository - RHEL {1}
20- baseurl=https://linux.mellanox.com/public/repo/doca/{0}/{1}/x86_64 /
21+ baseurl=https://linux.mellanox.com/public/repo/doca/{0}/{1}/{2} /
2122enabled=1
2223gpgcheck=1
23- gpgkey=https://linux.mellanox.com/public/repo/doca/{0}/GPG-KEY-Mellanox
24+ gpgkey=https://linux.mellanox.com/public/repo/doca/{0}/{1}/{2}/ GPG-KEY-Mellanox.pub
2425)" ;
25- std::istringstream data (fmt::format (templ, version, distro));
26+ std::istringstream data (fmt::format (templ, version, distro, arch ));
2627 return data;
2728}
2829
@@ -58,25 +59,50 @@ void OFED::setKind(Kind kind) { m_kind = kind; }
5859
5960OFED::Kind OFED::getKind () const { return m_kind; }
6061
62+ bool OFED::installed () const
63+ {
64+ if (cloyster::getEnvironmentVariable (" CATTUS_FORCE_INFINIBAND_INSTALL" ) == " 1" ) {
65+ return false ;
66+ }
67+
68+ if (!cloyster::dryRun) {
69+ return false ;
70+ }
71+
72+ auto runner = cloyster::Singleton<BaseRunner>::get ();
73+ switch (m_kind) {
74+ case OFED::Kind::Mellanox:
75+ return cloyster::exists (" /opt/mellanox/doca/tools/doca-kernel-support" );
76+ case OFED::Kind::Inbox:
77+ return runner->executeCommand (" dnf group info \" Infiniband Support\" " ) == 0 ;
78+ case OFED::Kind::Oracle:
79+ throw std::logic_error (" Not implemented" );
80+ }
81+
82+ std::unreachable ();
83+ }
84+
6185void OFED::install () const
6286{
87+ // Idempotency check
88+ if (installed ()) {
89+ return ;
90+ }
91+
6392 switch (m_kind) {
6493 case OFED::Kind::Inbox:
6594 runCommand (" dnf -y groupinstall \" Infiniband Support\" " );
66-
6795 break ;
6896
6997 case OFED::Kind::Mellanox:
7098 {
99+ auto cluster = cloyster::Singleton<cloyster::models::Cluster>::get ();
71100 auto runner = cloyster::Singleton<cloyster::services::BaseRunner>::get ();
72101 auto repoManager = cloyster::Singleton<cloyster::services::repos::RepoManager>::get ();
73102
74- if (runner->executeCommand (" modprobe mlx5_core" ) == 0 ) {
75- LOG_WARN (" mlx5_core module loaded, skiping DOCA setup" );
76- return ;
77- }
78-
79- auto repoData = docaRepoTemplate (getVersion (), headnodeDistroName ());
103+ auto repoData = docaRepoTemplate (
104+ getVersion (), headnodeDistroName (),
105+ cloyster::utils::enumToString (cluster->getHeadnode ().getOS ().getArch ()));
80106 std::filesystem::path path = " /etc/yum.repos.d/mlx-doca.repo" ;
81107
82108 // Install the repository and enable it
@@ -85,21 +111,40 @@ void OFED::install() const
85111 repoManager->enable (" doca" );
86112
87113 // Install the required packages
88- runner->executeCommand (" dnf makecache" );
89- runner->executeCommand (" dnf install –y kernel kernel-devel doca-extra" );
114+ runner->checkCommand (" dnf makecache" );
115+ runner->checkCommand (" dnf -y install kernel kernel-devel doca-extra" );
116+
117+ LOG_INFO (" Compiling OFED DOCA drivers, this may take a while" );
118+ // Run the Mellanox script, this generates an RPM at tmp.
119+ //
120+ // Use the kernel-devel version instead of the booted kernel
121+ // version, this is to handle the case where a new kernel is
122+ // installed but no reboot was done yet. After compiling the
123+ // drivers the headnode should be rebooted to reload the new kernel.
124+ // The driver may support weak updates modules and load without
125+ // need for reboot.
126+ if (cloyster::getEnvironmentVariable (" CATTUS_SKIP_INFINIBAND_COMPILE_DOCA_DRIVER" ) != " 1" ) {
127+ runner->checkCommand (" bash -c \" /opt/mellanox/doca/tools/doca-kernel-support -k $(rpm -q --qf \" %{VERSION}-%{RELEASE}.%{ARCH}\n\" kernel-devel)\" " );
128+ }
90129
91- // Run the Mellanox script, this generates an RPM at tmp
92- assert (runner->executeCommand (" /opt/mellanox/doca/tools/doca-kernel-support -k $(rpm -q --qf \" %{VERSION}-%{RELEASE}.%{ARCH}\n\" kernel-devel" ) == 0 );
130+ // Get the last rpm in /tmp/DOCA*/ folder
131+ auto rpm = runner->checkOutput (" bash -c \" find /tmp/DOCA*/ -name '*.rpm' -printf '%T@ %p\n ' | sort -nk1 | tail -1 | awk '{print $2}'\" " );
132+ assert (rpm.size () > 0 ); // at last one line
93133
94134 // Install the (last) generated rpm
95- runner->executeCommand (" rpm -ivh $(find /tmp/DOCA.*/ -name '*.rpm' -printf \" %T@ %p\n\" | sort -nrk1 | tail -1 | awk '{print $2}')" );
96-
97- runner->executeCommand (" dnf makecache" );
98- runner->executeCommand (" dnf install –y kernel kernel-devel doca-extra" );
99- if (runner->executeCommand (" lsmod | grep mlx5_core" ) != 0 ) {
100- runner->executeCommand (" modprobe mlx_core" );
101- }
102-
135+ runner->executeCommand (fmt::format (" rpm -vih {}" , rpm[0 ]));
136+
137+ runner->checkCommand (" dnf makecache" );
138+ // @NOTE: Are these packages correct/good default?
139+ runner->checkCommand (" dnf install -y \
140+ kmod-mlnx-ofa_kernel \
141+ mlnx-ofa_kernel \
142+ mlnx-tools \
143+ xpmem \
144+ kmod-iser \
145+ kmod-srp \
146+ mlnx-ofa_kernel-devel" );
147+ runner->checkCommand (" modprobe mlx5_core" );
103148 }
104149 break ;
105150
0 commit comments