Skip to content

Commit 38bffdb

Browse files
authored
refactor(spurctld): remove hostname aliasing logic (#315)
The controller used to silently alias an agent's hostname to an unmatched config entry, which hid misconfiguration. With label-based partition selectors and the agent's `--hostname` flag, this aliasing is no longer needed. Agents whose hostname doesn't match any config entry now register under their real name.
1 parent 0779957 commit 38bffdb

1 file changed

Lines changed: 11 additions & 73 deletions

File tree

crates/spurctld/src/cluster.rs

Lines changed: 11 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ pub struct ClusterManager {
5252
reservations: RwLock<Vec<Reservation>>,
5353
steps: RwLock<HashMap<(JobId, u32), JobStep>>,
5454
license_pool: RwLock<HashMap<String, u64>>,
55-
hostname_aliases: RwLock<HashMap<String, String>>,
5655
raft: RwLock<Option<SpurRaft>>,
5756
accounting: RwLock<Option<AccountingNotifier>>,
5857
fairshare_cache: Arc<FairshareCache>,
@@ -75,7 +74,6 @@ impl ClusterManager {
7574
steps: RwLock::new(HashMap::new()),
7675
next_job_id: AtomicU32::new(1),
7776
license_pool: RwLock::new(license_pool),
78-
hostname_aliases: RwLock::new(HashMap::new()),
7977
raft: RwLock::new(None),
8078
accounting: RwLock::new(None),
8179
fairshare_cache,
@@ -757,97 +755,46 @@ impl ClusterManager {
757755
source: NodeSource,
758756
labels: HashMap<String, String>,
759757
) -> anyhow::Result<()> {
760-
// Normalize node name: if the agent's hostname doesn't match any config
761-
// entry, check if there's an unmatched config node it could be aliased to.
762-
// This handles single-node setups where config says "localhost" but the
763-
// agent registers with its real hostname.
764-
let effective_name = {
765-
let registered_nodes = self.nodes.read();
766-
let mut matches_config = false;
767-
for nc in &self.config.nodes {
768-
if let Ok(hosts) = spur_core::hostlist::expand(&nc.names) {
769-
if hosts.contains(&name) {
770-
matches_config = true;
771-
break;
772-
}
773-
}
774-
}
775-
if !matches_config {
776-
// Agent hostname doesn't match config — find an unmatched config node
777-
let mut candidate = None;
778-
for nc in &self.config.nodes {
779-
if let Ok(hosts) = spur_core::hostlist::expand(&nc.names) {
780-
for host in &hosts {
781-
if !registered_nodes.contains_key(host) {
782-
candidate = Some(host.clone());
783-
break;
784-
}
785-
}
786-
if candidate.is_some() {
787-
break;
788-
}
789-
}
790-
}
791-
if let Some(config_name) = candidate {
792-
info!(
793-
agent_hostname = %name,
794-
config_name = %config_name,
795-
"node hostname doesn't match config — using config name"
796-
);
797-
// Store the alias so heartbeats from this hostname find the right node
798-
drop(registered_nodes);
799-
self.hostname_aliases
800-
.write()
801-
.insert(name.clone(), config_name.clone());
802-
config_name
803-
} else {
804-
name.clone()
805-
}
806-
} else {
807-
name.clone()
808-
}
809-
};
810-
811758
let action = {
812759
let nodes = self.nodes.read();
813-
evaluate_registration(nodes.get(&effective_name), &resources)
760+
evaluate_registration(nodes.get(&name), &resources)
814761
};
815762

816763
match action {
817764
RegistrationAction::Skip => {
818-
debug!(node = %effective_name, "node unchanged, skipping");
819-
self.sync_node_labels(&effective_name, labels)?;
765+
debug!(node = %name, "node unchanged, skipping");
766+
self.sync_node_labels(&name, labels)?;
820767
}
821768
RegistrationAction::Update => {
822769
self.propose(WalOperation::NodeUpdate {
823-
name: effective_name.clone(),
770+
name: name.clone(),
824771
resources,
825772
address,
826773
port,
827774
wg_pubkey,
828775
version,
829776
})?;
830-
self.sync_node_labels(&effective_name, labels)?;
831-
if let Some(node) = self.nodes.write().get_mut(&effective_name) {
777+
self.sync_node_labels(&name, labels)?;
778+
if let Some(node) = self.nodes.write().get_mut(&name) {
832779
node.source = source;
833780
}
834-
info!(node = %effective_name, "node updated (resources changed)");
781+
info!(node = %name, "node updated (resources changed)");
835782
}
836783
RegistrationAction::Register => {
837784
self.propose(WalOperation::NodeRegister {
838-
name: effective_name.clone(),
785+
name: name.clone(),
839786
resources,
840787
address,
841788
port,
842789
wg_pubkey,
843790
version,
844791
labels,
845792
})?;
846-
if let Some(node) = self.nodes.write().get_mut(&effective_name) {
793+
if let Some(node) = self.nodes.write().get_mut(&name) {
847794
node.source = source;
848795
node.agent_start_time = Some(Utc::now());
849796
}
850-
info!(node = %effective_name, "node registered");
797+
info!(node = %name, "node registered");
851798
}
852799
}
853800
Ok(())
@@ -885,14 +832,8 @@ impl ClusterManager {
885832
/// State recovery is handled separately by `check_node_health`, which
886833
/// detects the fresh `last_heartbeat` and proposes a WAL-backed transition.
887834
pub fn update_heartbeat(&self, name: &str, cpu_load: u32, free_memory_mb: u64) -> bool {
888-
let effective_name = self
889-
.hostname_aliases
890-
.read()
891-
.get(name)
892-
.cloned()
893-
.unwrap_or_else(|| name.to_string());
894835
let mut nodes = self.nodes.write();
895-
if let Some(node) = nodes.get_mut(&effective_name) {
836+
if let Some(node) = nodes.get_mut(name) {
896837
node.cpu_load = cpu_load;
897838
node.free_memory_mb = free_memory_mb;
898839
node.last_heartbeat = Some(Utc::now());
@@ -2265,7 +2206,6 @@ struct ClusterSnapshot {
22652206
reservations: Vec<Reservation>,
22662207
steps: Vec<JobStep>,
22672208
license_pool: HashMap<String, u64>,
2268-
hostname_aliases: HashMap<String, String>,
22692209
}
22702210

22712211
impl ClusterManager {
@@ -2313,7 +2253,6 @@ impl StateMachineApply for ClusterManager {
23132253
reservations: self.reservations.read().clone(),
23142254
steps: self.steps.read().values().cloned().collect(),
23152255
license_pool: self.license_pool.read().clone(),
2316-
hostname_aliases: self.hostname_aliases.read().clone(),
23172256
};
23182257
serde_json::to_vec(&snap).map_err(Into::into)
23192258
}
@@ -2343,7 +2282,6 @@ impl StateMachineApply for ClusterManager {
23432282
}
23442283

23452284
*self.license_pool.write() = snap.license_pool;
2346-
*self.hostname_aliases.write() = snap.hostname_aliases;
23472285

23482286
self.next_job_id.store(next_id, Ordering::Relaxed);
23492287

0 commit comments

Comments
 (0)