Skip to content
This repository was archived by the owner on Apr 16, 2026. It is now read-only.

Commit ea38360

Browse files
committed
Fix Confluent deployment
1 parent feb855b commit ea38360

9 files changed

Lines changed: 94 additions & 60 deletions

File tree

include/cloysterhpc/models/cluster.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ class Cluster {
5252
* @enum Provisioner
5353
* @brief Enumeration for cluster provisioners.
5454
*/
55-
enum class Provisioner { xCAT };
55+
enum class Provisioner { xCAT, Confluent };
5656

5757
private:
5858
std::string m_name;

include/cloysterhpc/services/runner.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@ namespace unsafe {
3232
if (!opts->dryRun) {
3333
LOG_DEBUG("Running shell command: {}", command);
3434
boost::process::ipstream pipe_stream;
35-
boost::process::child child("/bin/bash", "-xc", command,
35+
// -x for debug
36+
// -l for loading /etc/profile.d/* files
37+
// -e for stopping at the first error, use (|| :) to ignore
38+
boost::process::child child("/bin/bash", "-xelc", command,
3639
boost::process::std_out > pipe_stream);
3740

3841
std::string line;
@@ -59,7 +62,10 @@ namespace unsafe {
5962
if (!opts->dryRun) {
6063
LOG_DEBUG("Running shell command: {}", command);
6164
boost::process::ipstream pipe_stream;
62-
boost::process::child child("/bin/bash", "-xc", command,
65+
// -x for debug
66+
// -l for loading /etc/profile.d/* files
67+
// -e for stopping at the first error, use (|| :) to ignore
68+
boost::process::child child("/bin/bash", "-xelc", command,
6369
boost::process::std_out > pipe_stream);
6470

6571
std::string line;

src/models/answerfile.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

6-
// TODO: CFL Retrieve the provisioner (xcat | confluent (default: confluent) from the answerfile
7-
86
#include <cstddef>
97
#include <fmt/core.h>
108
#include <iterator>

src/models/cluster.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -766,7 +766,14 @@ void Cluster::fillData(const AnswerFile& answerfil)
766766

767767

768768
// TODO: CFL Retrieve the provisioner from the answerfile
769-
setProvisioner(Provisioner::xCAT);
769+
const auto provisioner = utils::string::lower(answerfil.system.provisioner);
770+
if (provisioner == "xcat") {
771+
setProvisioner(Provisioner::xCAT);
772+
} else if (provisioner == "confluent") {
773+
setProvisioner(Provisioner::Confluent);
774+
} else {
775+
cloyster::functions::abort("Invalid provisioner {}", provisioner);
776+
}
770777

771778
// FIXME: This should come from /etc/os-release
772779
m_headnode.setOS(nodeOS);

src/models/slurm.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ SLURM::SLURM(const Cluster& cluster)
1919

2020
void SLURM::installServer()
2121
{
22-
cloyster::Singleton<cloyster::services::IOSService>::get()->install(
22+
cloyster::Singleton<const cloyster::services::IOSService>::get()->install(
2323
"ohpc-slurm-server");
2424
}
2525

@@ -55,14 +55,14 @@ void SLURM::configureServer()
5555

5656
void SLURM::enableServer()
5757
{
58-
auto osservice = cloyster::Singleton<services::IOSService>::get();
58+
auto osservice = cloyster::Singleton<const services::IOSService>::get();
5959
osservice->enableService("munge");
6060
osservice->enableService("slurmctld");
6161
}
6262

6363
void SLURM::startServer()
6464
{
65-
auto osservice = cloyster::Singleton<services::IOSService>::get();
65+
auto osservice = cloyster::Singleton<const services::IOSService>::get();
6666
osservice->startService("munge");
6767
osservice->startService("slurmctld");
6868
}

src/network.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -368,9 +368,6 @@ std::string Network::fetchDomainName()
368368
return ret;
369369
}
370370

371-
/* TODO: Check return type
372-
* - We can't return const (don't know exactly why)
373-
*/
374371
std::vector<address> Network::getNameservers() const
375372
{
376373
std::vector<address> returnVector;

src/services/ansible/roles/network.cpp

Lines changed: 58 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -42,18 +42,45 @@ void disableNetworkManagerDNSOverride()
4242

4343
void configureNetworks(const std::list<Connection>& connections)
4444
{
45-
LOG_INFO("Setting up networks 2")
46-
4745
osservice()->enableService("NetworkManager");
46+
disableNetworkManagerDNSOverride();
47+
const auto shouldDisableIPV6 = []() -> bool {
48+
switch (cluster()->getProvisioner()) {
49+
case cloyster::models::Cluster::Provisioner::xCAT:
50+
return true;
51+
case cloyster::models::Cluster::Provisioner::Confluent:
52+
return false;
53+
default:
54+
// Unreachable
55+
cloyster::functions::abort("BUG: Invalid provisioner in network setup");
56+
}
57+
}();
4858

4959
for (const auto& connection : std::as_const(connections)) {
50-
LOG_INFO("Setting up networks ->> {}", connection.getNetwork()->getProfile())
5160
/* For now, we just skip the external network to avoid disconnects */
5261
if (connection.getNetwork()->getProfile() == Network::Profile::External) {
5362
continue;
5463
}
5564

56-
LOG_INFO("Setting up networks {}", connection.getNetwork()->getProfile())
65+
// These validations are sanity checks to improve error messages and
66+
// keep code future proof. The real validation should happen at
67+
// cluster.fillData method
68+
cloyster::functions::abortif(
69+
connection.getNetwork()->getProfile() == Network::Profile::Management
70+
&& connection.getNetwork()->getGateway().is_unspecified(),
71+
"Management network requires a gateway, please define a gateway in "
72+
"network_management section of the answerfile: {}",
73+
answerfile()->path());
74+
75+
if (connection.getNetwork()->getProfile() != Network::Profile::Management &&
76+
!connection.getNetwork()->getGateway().is_unspecified()) {
77+
LOG_WARN("Ignoring gateway in {} network {}, only Management network should specify a gateway",
78+
connection.getNetwork()->getGateway().to_string(),
79+
connection.getNetwork()->getProfile());
80+
}
81+
82+
const auto shouldUseGateway = connection.getNetwork()->getProfile()
83+
== Network::Profile::Management;
5784

5885
#ifndef NDEBUG
5986
if (!connection.getInterface().has_value()) {
@@ -65,72 +92,62 @@ void configureNetworks(const std::list<Connection>& connections)
6592

6693
std::vector<address> nameservers
6794
= connection.getNetwork()->getNameservers();
68-
LOG_INFO("Setting up networks {}", connection.getNetwork()->getProfile())
6995
std::vector<std::string> formattedNameservers;
70-
LOG_INFO("Setting up networks {}", connection.getNetwork()->getProfile())
7196
formattedNameservers.reserve(nameservers.size());
7297
for (const auto & nameserver : nameservers) {
7398
formattedNameservers.emplace_back(nameserver.to_string());
7499
}
75100

76101
LOG_INFO("Setting up networks {}", connection.getNetwork()->getProfile())
77-
auto opts = options();
78102
auto connectionName
79103
= cloyster::utils::enums::toString(connection.getNetwork()->getProfile());
80-
if (!opts->dryRun
81-
82-
&& runner()->executeCommand(
83-
fmt::format("nmcli connection show {}", connectionName))
84-
== 0) {
85-
LOG_WARN("Connection exists {}, skipping", connectionName);
86-
continue;
87-
}
88-
89-
LOG_INFO("Setting up networks {}", connection.getNetwork()->getProfile())
90-
91104
deleteConnectionIfExists(connectionName);
105+
92106
::runner()->executeCommand(
93107
fmt::format("nmcli device set {} managed yes", interface));
94108
::runner()->executeCommand(
95109
fmt::format("nmcli device set {} autoconnect yes", interface));
110+
::runner()->executeCommand(
111+
fmt::format("nmcli connection delete {}", connection.getNetwork()->getProfile()));
112+
113+
// example
114+
// nmcli connection delete Management; nmcli connection add con-name Management ifname enp2s2 type Ethernet mtu 1500 ipv4.method manual ipv4.address 192.168.30.254/24 ipv4.dns "192.168.122.1"; nmcli device connect enp2s2
96115
::runner()->executeCommand(fmt::format(
97-
"nmcli connection add con-name {} ifname {} type {} "
98-
"mtu {} ipv4.method manual ipv4.address {}/{} "
99-
"ipv4.dns \"{}\" "
100-
// "ipv4.gateway {} ipv4.dns \"{}\" "
101-
// @TODO: CFL only do this if we're using xCAT as provisioner
102-
// @FIXME: This will break Confluent, is it required by xCAT?
103-
"ipv4.dns-search {} ipv6.method disabled",
104-
cloyster::utils::enums::toString(
105-
connection.getNetwork()->getProfile()),
106-
interface,
107-
cloyster::utils::enums::toString(
108-
connection.getNetwork()->getType()),
109-
connection.getMTU(), connection.getAddress().to_string(),
110-
connection.getNetwork()->cidr.at(
111-
connection.getNetwork()->getSubnetMask().to_string()),
112-
// connection.getNetwork()->getGateway().to_string(),
113-
fmt::join(formattedNameservers, " "),
114-
connection.getNetwork()->getDomainName()));
116+
"nmcli connection add con-name {connName} ifname {ifname} type {type} "
117+
"mtu {mtu} ipv4.method manual ipv4.address {ip}/{cidr} "
118+
"ipv4.dns \"{dns}\" "
119+
// When I set up the gateway I lost the connection to the VM, so I'm commenting
120+
// it out for now
121+
// "{gw} "
122+
// "ipv4.dns-search {dnsSearch} "
123+
"{ipv6}",
124+
fmt::arg("connName", cloyster::utils::enums::toString(connection.getNetwork()->getProfile())),
125+
fmt::arg("ifname", interface),
126+
fmt::arg("type", cloyster::utils::enums::toString(connection.getNetwork()->getType())),
127+
fmt::arg("mtu", connection.getMTU()),
128+
fmt::arg("ip", connection.getAddress().to_string()),
129+
fmt::arg("cidr", connection.getNetwork()->cidr.at(connection.getNetwork()->getSubnetMask().to_string())),
130+
fmt::arg("dns", fmt::join(formattedNameservers, " ")),
131+
// fmt::arg("gw", shouldUseGateway
132+
// ? fmt::format("ipv4.gateway {}", connection.getNetwork()->getGateway().to_string())
133+
// : ""),
134+
// fmt::arg("dnsSearch", connection.getNetwork()->getDomainName()),
135+
fmt::arg("ipv6", shouldDisableIPV6 ? "ipv6.method disabled" : "")
136+
));
115137

116138

117-
LOG_INFO("Setting up networks {}", connection.getNetwork()->getProfile())
118139
/* Give NetworkManager some time to settle things
119140
* Avoids: Error: Connection activation failed: IP configuration could
120141
* not be reserved (no available address, timeout, etc.).
121142
*/
122143
std::this_thread::sleep_for(std::chrono::milliseconds(200));
123144

124-
LOG_INFO("Setting up networks {}", connection.getNetwork()->getProfile())
125-
126145
// Breaking my ssh connection during development
127146
runner()->executeCommand(
128147
fmt::format("nmcli device connect {}", interface));
129148

130-
LOG_INFO("Setting up networks {} returning", connection.getNetwork()->getProfile())
131149
}
132150

133-
disableNetworkManagerDNSOverride();
134151

135152
}
136153

src/services/confluent.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,7 @@ nodeattrib {nodeName} net.ipv4_address={nodeIp}/{nodeCIDR}
3030
);
3131

3232
if (const auto& macOpt = node.getConnection(Network::Profile::Management).getMAC(); macOpt) {
33-
services::runner::shell::fmt(
34-
"nodeattrib {nodeName} net.hwaddr={nodeMac}",
33+
services::runner::shell::fmt("nodeattrib {nodeName} net.hwaddr={nodeMac}",
3534
fmt::arg("nodeName", node.getHostname()),
3635
fmt::arg("nodeMac", macOpt.value())
3736
);
@@ -64,13 +63,13 @@ void Confluent::install() {
6463
// NOTE: WIP - GENERALIZE THIS
6564
runner::shell::fmt(R"d(
6665
# Add the Confluent repository
67-
rpm -ivh https://hpc.lenovo.com/yum/latest/el{releasever}/{arch}/lenovo-hpc-yum-1-1.{arch}.rpm
66+
rpm -ivh https://hpc.lenovo.com/yum/latest/el{releasever}/{arch}/lenovo-hpc-yum-1-1.{arch}.rpm || :
6867
6968
# Install required packages
7069
# Technically only `lenovo-confluent` is requqired, however:
7170
# 1. If we are dealing with legacy systems we may need `tftp-server` also
7271
# 2. If we want DNS resolution, which we may want, also add `dnsmasq`.
73-
dnf install -y lenovo-confluent tftp-server dnsmasq
72+
dnf install -y lenovo-confluent tftp-server dnsmasq || :
7473
systemctl enable confluent --now
7574
systemctl enable httpd --now
7675
systemctl enable tftp.socket --now
@@ -100,7 +99,9 @@ nodegroupattrib everything \
10099
# nodegroupattrib everything -p bmcuser bmcpass crypted.rootpassword crypted.grubpassword
101100
102101
# Generate a keypair for internal cluster usage
103-
test -f ~/.ssh/id_ed25519 || ssh-keygen -t ed25519 -N ""
102+
# This command may issue prompts, the <<< n is to not replace
103+
# the old ssh key
104+
ssh-keygen -f ~/.ssh/id_ed25519 -t ed25519 -N "" <<< n
104105
105106
# Configure the osdeploy parameters; it's an interactive interface, so we must find a way to automate this step
106107
osdeploy initialize -u -s -k -l -p -a -t -g
@@ -123,12 +124,12 @@ imgutil pack /tmp/scratchdir/ {image}-diskless
123124
osdeploy list
124125
125126
# Remove the leftover files from the chroot
126-
rm -rf /tmp/scratchdir
127+
rm -rf /tmp/scratchdir || :
127128
128129
129130
)d",
130131

131-
fmt::arg("domain", cluster()->getHeadnode().getConnection(Network::Profile::Management).getFQDN()),
132+
fmt::arg("domain", cluster()->getDomainName()),
132133
fmt::arg("releasever", os().getMajorVersion()),
133134
fmt::arg("hnIp", cluster()->getHeadnode().getConnection(Network::Profile::Management).getAddress().to_string()),
134135
fmt::arg("arch", cloyster::utils::enums::toString(os().getArch())),

src/services/init.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,14 @@ namespace cloyster::services {
1313
using namespace cloyster;
1414
using namespace cloyster::services;
1515

16+
// WARNING: If you change the type T in Singleton<T>::init(...) (to const T for
17+
// instance) all the Singleton<T>::get need to be changed, otherwise you get
18+
// "Singleton read before initialization error" at runtime. While there are
19+
// getters to handle this in cloyster/utils/singletons in a uniform way and for
20+
// most cases, these getters depend on headers (that introduce the type
21+
// T in question), so files including the same header cannot use these getters
22+
// (or we have recursive header inclusion error).
23+
1624
// Singletons that depend only on the options; the cluster model
1725
// depends on these
1826
void initializeSingletonsOptions(std::unique_ptr<const Options>&& opts)

0 commit comments

Comments
 (0)