88#include < cloysterhpc/ofed.h>
99#include < utility>
1010
11+ using cloyster::BaseRunner;
1112using cloyster::runCommand;
1213
1314namespace {
1415
15- auto docaRepoTemplate (std::string version, std::string distro)
16+ auto docaRepoTemplate (std::string version, std::string distro, std::string arch )
1617{
1718 static constexpr std::string_view templ = R"(
1819[doca]
1920name=NVIDIA DOCA Repository - RHEL {1}
20- baseurl=https://linux.mellanox.com/public/repo/doca/{0}/{1}/x86_64 /
21+ baseurl=https://linux.mellanox.com/public/repo/doca/{0}/{1}/{2} /
2122enabled=1
2223gpgcheck=1
23- gpgkey=https://linux.mellanox.com/public/repo/doca/{0}/GPG-KEY-Mellanox
24+ gpgkey=https://linux.mellanox.com/public/repo/doca/{0}/{1}/{2}/ GPG-KEY-Mellanox.pub
2425)" ;
25- std::istringstream data (fmt::format (templ, version, distro));
26+ std::istringstream data (fmt::format (templ, version, distro, arch ));
2627 return data;
2728}
2829
@@ -58,25 +59,50 @@ void OFED::setKind(Kind kind) { m_kind = kind; }
5859
5960OFED::Kind OFED::getKind () const { return m_kind; }
6061
62+ bool OFED::installed () const
63+ {
64+ if (cloyster::getEnvironmentVariable (" CATTUS_FORCE_INFINIBAND_INSTALL" ) == " 1" ) {
65+ return false ;
66+ }
67+
68+ if (!cloyster::dryRun) {
69+ return false ;
70+ }
71+
72+ auto runner = cloyster::Singleton<BaseRunner>::get ();
73+ switch (m_kind) {
74+ case OFED::Kind::Mellanox:
75+ return cloyster::exists (" /opt/mellanox/doca/tools/doca-kernel-support" );
76+ case OFED::Kind::Inbox:
77+ return runner->executeCommand (" dnf group info \" Infiniband Support\" " ) == 0 ;
78+ case OFED::Kind::Oracle:
79+ throw std::logic_error (" Not implemented" );
80+ }
81+
82+ std::unreachable ();
83+ }
84+
6185void OFED::install () const
6286{
87+ // Idempotency check
88+ if (installed ()) {
89+ return ;
90+ }
91+
6392 switch (m_kind) {
6493 case OFED::Kind::Inbox:
6594 runCommand (" dnf -y groupinstall \" Infiniband Support\" " );
66-
6795 break ;
6896
6997 case OFED::Kind::Mellanox:
7098 {
99+ auto cluster = cloyster::Singleton<cloyster::models::Cluster>::get ();
71100 auto runner = cloyster::Singleton<cloyster::services::BaseRunner>::get ();
72101 auto repoManager = cloyster::Singleton<cloyster::services::repos::RepoManager>::get ();
73102
74- if (runner->executeCommand (" modprobe mlx5_core" ) == 0 ) {
75- LOG_WARN (" mlx5_core module loaded, skiping DOCA setup" );
76- return ;
77- }
78-
79- auto repoData = docaRepoTemplate (getVersion (), headnodeDistroName ());
103+ auto repoData = docaRepoTemplate (
104+ getVersion (), headnodeDistroName (),
105+ cloyster::utils::enumToString (cluster->getHeadnode ().getOS ().getArch ()));
80106 std::filesystem::path path = " /etc/yum.repos.d/mlx-doca.repo" ;
81107
82108 // Install the repository and enable it
@@ -85,21 +111,23 @@ void OFED::install() const
85111 repoManager->enable (" doca" );
86112
87113 // Install the required packages
88- runner->executeCommand (" dnf makecache" );
89- runner->executeCommand (" dnf install –y kernel kernel-devel doca-extra" );
114+ runner->checkCommand (" dnf makecache" );
115+ runner->checkCommand (" dnf -y install kernel kernel-devel doca-extra" );
90116
91- // Run the Mellanox script, this generates an RPM at tmp
92- assert (runner->executeCommand (" /opt/mellanox/doca/tools/doca-kernel-support -k $(rpm -q --qf \" %{VERSION}-%{RELEASE}.%{ARCH}\n\" kernel-devel" ) == 0 );
117+ LOG_INFO (" Compiling OFED DOCA drivers, this may take a while" );
118+ // Run the Mellanox script, this generates an RPM at tmp.
119+ //
120+ // Use the kernel-devel version instead of the booted kernel
121+ // version; this handles the case where a new kernel has been
122+ // installed but the machine has not been rebooted yet.
123+ runner->checkCommand (" bash -c \" /opt/mellanox/doca/tools/doca-kernel-support -k $(rpm -q --qf \" %{VERSION}-%{RELEASE}.%{ARCH}\n\" kernel-devel)\" " );
93124
94125 // Install the (last) generated rpm
95- runner->executeCommand (" rpm -ivh $(find /tmp/DOCA.*/ -name '*.rpm' -printf \" %T@ %p\n\" | sort -nrk1 | tail -1 | awk '{print $2}')" );
96-
97- runner->executeCommand (" dnf makecache" );
98- runner->executeCommand (" dnf install –y kernel kernel-devel doca-extra" );
99- if (runner->executeCommand (" lsmod | grep mlx5_core" ) != 0 ) {
100- runner->executeCommand (" modprobe mlx_core" );
101- }
126+ runner->checkCommand (" bash -c $'rpm -vih $(find /tmp/DOCA*/ -name \' *.rpm\' -printf \' %T@ %p\n\' | sort -nk1 | tail -1 | awk \' {print $2}\' )" );
102127
128+ runner->checkCommand (" dnf makecache" );
129+ runner->checkCommand (" dnf -y install kernel kernel-devel doca-extra" );
130+ runner->checkCommand (" modprobe mlx_core" );
103131 }
104132 break ;
105133
0 commit comments