Skip to content

Commit 79dff22

Browse files
committed
refactor(supervisor): rename vm_hash->vm_id, numeric vm_id->vm_index
Behaviour-neutral identifier rename so the supervisor objects say what they mean (design: docs/plans/2026-06-19-supervisor-vm-id-rename-design.md). Two axes: - numeric local VM id 'vm_id: int' -> 'vm_index' (MicroVM, VmExecution.vm_index property + create(), network/, hypervisors/, controllers, get_unique_vm_index). - string identity 'vm_hash' -> 'vm_id' on the supervisor objects (VmExecution, VmPool keys/params, LocalSupervisor) and agent reads of that attribute (run/custom_logs/tasks/migration). Left unchanged: the controller's own 'vm_hash: ItemHash' param, the serialized Configuration.vm_id(int)/vm_hash(str) fields and the '"vm_id"'/'"vm_hash"' recreate-dict string keys, the wire CreateVmSpec.vm_id/VmInfo.vm_id, and genuine ItemHash locals. Verified: mypy unchanged at baseline (43/13); full supervisor+migration+network suite green (3 env-only test_interfaces pyroute2/root failures excepted); import-linter 4 kept/0 broken; grep sweep clean (no execution.vm_hash).
1 parent 78536c3 commit 79dff22

34 files changed

Lines changed: 293 additions & 295 deletions

src/aleph/vm/agent/custom_logs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def filter(self, record):
2929
if not vm_hash:
3030
vm_execution: VmExecution | None = ctx_current_execution.get(None)
3131
if vm_execution:
32-
vm_hash = vm_execution.vm_hash
32+
vm_hash = vm_execution.vm_id
3333

3434
if not vm_hash:
3535
return False

src/aleph/vm/agent/tasks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ async def _handle_domains_aggregate(
257257
has_local_instance = any(
258258
execution.is_instance
259259
and execution.vm
260-
and (record := registry.get(execution.vm_hash)) is not None
260+
and (record := registry.get(execution.vm_id)) is not None
261261
and record.message.address == address
262262
for execution in pool.executions.values()
263263
)

src/aleph/vm/hypervisors/firecracker/microvm.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def supports_ipv6(self) -> bool:
7878

7979

8080
class MicroVM:
81-
vm_id: int
81+
vm_index: int
8282
use_jailer: bool
8383
firecracker_bin_path: Path
8484
jailer_bin_path: Path | None
@@ -96,15 +96,15 @@ class MicroVM:
9696
journal_stderr: BinaryIO | int | None = None
9797

9898
def __repr__(self):
99-
return f"<MicroVM {self.vm_id}>"
99+
return f"<MicroVM {self.vm_index}>"
100100

101101
def __str__(self):
102-
return f"vm-{self.vm_id}"
102+
return f"vm-{self.vm_index}"
103103

104104
@property
105105
def namespace_path(self) -> str:
106106
firecracker_bin_name = os.path.basename(self.firecracker_bin_path)
107-
return str(self.jailer_base_directory / firecracker_bin_name / str(self.vm_id))
107+
return str(self.jailer_base_directory / firecracker_bin_name / str(self.vm_index))
108108

109109
@property
110110
def jailer_path(self) -> str:
@@ -115,7 +115,7 @@ def socket_path(self) -> str:
115115
if self.use_jailer:
116116
return f"{self.jailer_path}/run/firecracker.socket"
117117
else:
118-
return f"/tmp/firecracker-{self.vm_id}.socket"
118+
return f"/tmp/firecracker-{self.vm_index}.socket"
119119

120120
@property
121121
def vsock_path(self) -> str:
@@ -126,7 +126,7 @@ def vsock_path(self) -> str:
126126

127127
def __init__(
128128
self,
129-
vm_id: int,
129+
vm_index: int,
130130
vm_hash: ItemHash,
131131
firecracker_bin_path: Path,
132132
jailer_base_directory: Path,
@@ -135,7 +135,7 @@ def __init__(
135135
init_timeout: float = 5.0,
136136
enable_log: bool = True,
137137
):
138-
self.vm_id = vm_id
138+
self.vm_index = vm_index
139139
self.vm_hash = vm_hash
140140
self.use_jailer = use_jailer
141141
self.jailer_base_directory = jailer_base_directory
@@ -263,7 +263,7 @@ async def start_jailed_firecracker(self, config_path: Path) -> asyncio.subproces
263263
options = (
264264
str(self.jailer_bin_path),
265265
"--id",
266-
str(self.vm_id),
266+
str(self.vm_index),
267267
"--exec-file",
268268
str(self.firecracker_bin_path),
269269
"--uid",
@@ -433,15 +433,15 @@ async def unix_client_connected(reader: asyncio.StreamReader, _writer: asyncio.S
433433
raise MicroVMFailedInitError() from error
434434

435435
async def shutdown(self) -> None:
436-
logger.debug(f"Shutdown vm={self.vm_id}")
436+
logger.debug(f"Shutdown vm={self.vm_index}")
437437
try:
438438
reader, writer = await asyncio.open_unix_connection(path=self.vsock_path)
439439
except (
440440
FileNotFoundError,
441441
ConnectionResetError,
442442
ConnectionRefusedError,
443443
) as error:
444-
logger.warning(f"VM={self.vm_id} cannot receive shutdown signal: {error.args}")
444+
logger.warning(f"VM={self.vm_index} cannot receive shutdown signal: {error.args}")
445445
return
446446

447447
try:
@@ -462,7 +462,7 @@ async def shutdown(self) -> None:
462462
if msg2 != b"STOPZ\n":
463463
logger.warning(f"Unexpected response from VM: {msg2[:20]!r}")
464464
except ConnectionResetError as error:
465-
logger.warning(f"ConnectionResetError in shutdown of {self.vm_id}: {error.args}")
465+
logger.warning(f"ConnectionResetError in shutdown of {self.vm_index}: {error.args}")
466466

467467
async def stop(self):
468468
if self.proc:
@@ -481,7 +481,7 @@ async def teardown(self):
481481
try:
482482
await asyncio.wait_for(self.shutdown(), timeout=5)
483483
except asyncio.TimeoutError:
484-
logger.exception(f"Timeout during VM shutdown vm={self.vm_id}")
484+
logger.exception(f"Timeout during VM shutdown vm={self.vm_index}")
485485
logger.debug("Waiting for one second for the process to shutdown")
486486
await asyncio.sleep(1)
487487
await self.stop()

src/aleph/vm/migration/helpers.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,16 @@ async def graceful_shutdown(execution: VmExecution, timeout: int = GRACEFUL_SHUT
3030
client.system_powerdown()
3131
client.close()
3232
except Exception as e:
33-
logger.warning("Failed to send system_powerdown for %s: %s", execution.vm_hash, e)
33+
logger.warning("Failed to send system_powerdown for %s: %s", execution.vm_id, e)
3434

3535
start = time.monotonic()
3636
while time.monotonic() - start < timeout:
3737
if execution.systemd_manager and not execution.systemd_manager.is_service_active(execution.controller_service):
38-
logger.info("VM %s shut down gracefully", execution.vm_hash)
38+
logger.info("VM %s shut down gracefully", execution.vm_id)
3939
return
4040
await asyncio.sleep(1)
4141

42-
logger.warning("VM %s did not shut down within %ds, forcing stop", execution.vm_hash, timeout)
42+
logger.warning("VM %s did not shut down within %ds, forcing stop", execution.vm_id, timeout)
4343
if execution.systemd_manager:
4444
execution.systemd_manager.stop_and_disable(execution.controller_service)
4545

src/aleph/vm/models.py

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ class VmExecution:
9595
"""
9696

9797
uuid: uuid.UUID # Unique identifier of this execution
98-
vm_hash: VmId
98+
vm_id: VmId
9999
# The message-free description this execution is built from.
100100
spec: CreateVmSpec
101101
resources: (
@@ -144,7 +144,7 @@ async def update_port_redirects(self, requested_ports: dict[int, dict[str, bool]
144144

145145
for protocol in SUPPORTED_PROTOCOL_FOR_REDIRECT:
146146
if target[protocol]:
147-
add_port_redirect_rule(self.vm.vm_id, interface, host_port, vm_port, protocol)
147+
add_port_redirect_rule(self.vm.vm_index, interface, host_port, vm_port, protocol)
148148
self.mapped_ports[int(vm_port)] = {"host": host_port, **target}
149149
changed = True
150150

@@ -155,7 +155,7 @@ async def update_port_redirects(self, requested_ports: dict[int, dict[str, bool]
155155
for protocol in SUPPORTED_PROTOCOL_FOR_REDIRECT:
156156
if current[protocol] != target[protocol]:
157157
if target[protocol]:
158-
add_port_redirect_rule(self.vm.vm_id, interface, host_port, vm_port, protocol)
158+
add_port_redirect_rule(self.vm.vm_index, interface, host_port, vm_port, protocol)
159159
else:
160160
remove_port_redirect_rule(interface, host_port, vm_port, protocol)
161161
changed = True
@@ -164,7 +164,7 @@ async def update_port_redirects(self, requested_ports: dict[int, dict[str, bool]
164164

165165
# Persist port mappings to dedicated table if anything changed
166166
if changed:
167-
await save_port_mappings(self.vm_hash, self.mapped_ports)
167+
await save_port_mappings(self.vm_id, self.mapped_ports)
168168

169169
async def recreate_port_redirect_rules(self) -> None:
170170
"""Recreate nftables port redirect rules from saved mapped_ports after restart.
@@ -215,15 +215,15 @@ async def recreate_port_redirect_rules(self) -> None:
215215
host_port,
216216
new_host_port,
217217
vm_port,
218-
self.vm_hash,
218+
self.vm_id,
219219
)
220220
host_port = new_host_port
221221
mapping["host"] = new_host_port
222222
port_changed = True
223223

224224
for protocol in protocols_to_create:
225225
all_entities += build_port_redirect_entities(
226-
self.vm.vm_id,
226+
self.vm.vm_index,
227227
interface,
228228
host_port,
229229
vm_port,
@@ -242,11 +242,11 @@ async def recreate_port_redirect_rules(self) -> None:
242242
protocol,
243243
host_port,
244244
vm_port,
245-
self.vm_hash,
245+
self.vm_id,
246246
)
247247

248248
if port_changed:
249-
await save_port_mappings(self.vm_hash, self.mapped_ports)
249+
await save_port_mappings(self.vm_id, self.mapped_ports)
250250

251251
async def removed_all_ports_redirection(self):
252252
if not self.vm:
@@ -323,12 +323,12 @@ def becomes_ready(self) -> Callable[[], Coroutine]:
323323
return self.ready_event.wait
324324

325325
@property
326-
def vm_id(self) -> int | None:
327-
return self.vm.vm_id if self.vm else None
326+
def vm_index(self) -> int | None:
327+
return self.vm.vm_index if self.vm else None
328328

329329
@property
330330
def controller_service(self) -> str:
331-
return f"aleph-vm-controller@{self.vm_hash}.service"
331+
return f"aleph-vm-controller@{self.vm_id}.service"
332332

333333
@property
334334
def allocated_memory_mib(self) -> int:
@@ -349,19 +349,19 @@ def has_resources(self) -> bool:
349349
return True
350350

351351
def __repr__(self):
352-
return f"<VMExecution {type(self.vm).__name__} {self.vm_hash} {self.times.started_at}>"
352+
return f"<VMExecution {type(self.vm).__name__} {self.vm_id} {self.times.started_at}>"
353353

354354
def __init__(
355355
self,
356-
vm_hash: VmId,
356+
vm_id: VmId,
357357
vm_spec: CreateVmSpec,
358358
snapshot_manager: SnapshotManager | None = None,
359359
systemd_manager: SystemDManager | None = None,
360360
persistent: bool = False,
361361
):
362362
self.init_task = None
363363
self.uuid = uuid.uuid1() # uuid1() includes the hardware address and timestamp
364-
self.vm_hash = vm_hash
364+
self.vm_id = vm_id
365365
self.spec = vm_spec
366366
self.times = VmExecutionTimes(defined_at=datetime.now(tz=timezone.utc))
367367
self.ready_event = asyncio.Event()
@@ -390,7 +390,7 @@ def from_spec(
390390
The supervisor's machinery (prepare/create/start) reads only the spec.
391391
"""
392392
return cls(
393-
vm_hash=spec.vm_id,
393+
vm_id=spec.vm_id,
394394
vm_spec=spec,
395395
snapshot_manager=snapshot_manager,
396396
systemd_manager=systemd_manager,
@@ -416,9 +416,9 @@ async def prepare(self) -> None:
416416
if self.spec.backend is Backend.FIRECRACKER:
417417
self.resources = SpecProgramResources.from_spec(self.spec)
418418
elif self.spec.tee is not None:
419-
self.resources = AlephQemuConfidentialResources.from_spec(self.spec, namespace=str(self.vm_hash))
419+
self.resources = AlephQemuConfidentialResources.from_spec(self.spec, namespace=str(self.vm_id))
420420
else:
421-
self.resources = AlephQemuResources.from_spec(self.spec, namespace=str(self.vm_hash))
421+
self.resources = AlephQemuResources.from_spec(self.spec, namespace=str(self.vm_id))
422422
self.times.prepared_at = datetime.now(tz=timezone.utc)
423423

424424
def uses_gpu(self, pci_host: str) -> bool:
@@ -429,7 +429,7 @@ def uses_gpu(self, pci_host: str) -> bool:
429429
return False
430430

431431
def create(
432-
self, vm_id: int, tap_interface: TapInterface | None = None, prepare: bool = True
432+
self, vm_index: int, tap_interface: TapInterface | None = None, prepare: bool = True
433433
) -> AlephVmControllerInterface:
434434
if not self.resources:
435435
msg = "Execution resources must be configured first"
@@ -439,8 +439,8 @@ def create(
439439
if self.spec.backend is Backend.FIRECRACKER:
440440
assert isinstance(self.resources, SpecProgramResources)
441441
self.vm = vm = SpecFirecrackerProgram(
442-
vm_id=vm_id,
443-
vm_hash=self.vm_hash,
442+
vm_index=vm_index,
443+
vm_hash=self.vm_id,
444444
spec=self.spec,
445445
resources=self.resources,
446446
tap_interface=tap_interface,
@@ -454,8 +454,8 @@ def create(
454454
# SAFETY-CRITICAL: never fall through to the plain AlephQemuInstance.
455455
assert isinstance(self.resources, AlephQemuConfidentialResources)
456456
self.vm = vm = AlephQemuConfidentialInstance(
457-
vm_id=vm_id,
458-
vm_hash=self.vm_hash,
457+
vm_index=vm_index,
458+
vm_hash=self.vm_id,
459459
resources=self.resources,
460460
enable_networking=self.spec.network.internet_access,
461461
confidential_policy=int(self.spec.tee.policy, 0),
@@ -465,8 +465,8 @@ def create(
465465
return vm
466466
assert isinstance(self.resources, AlephQemuResources)
467467
self.vm = vm = AlephQemuInstance(
468-
vm_id=vm_id,
469-
vm_hash=self.vm_hash,
468+
vm_index=vm_index,
469+
vm_hash=self.vm_id,
470470
resources=self.resources,
471471
enable_networking=self.spec.network.internet_access,
472472
hardware_resources=hardware_resources,
@@ -639,7 +639,7 @@ async def stop(self) -> None:
639639
# Prevent concurrent calls to stop() using a Lock
640640
async with self.stop_pending_lock:
641641
if self.times.stopped_at is not None:
642-
logger.debug(f"VM={self.vm.vm_id} already stopped")
642+
logger.debug(f"VM={self.vm.vm_index} already stopped")
643643
return
644644
if self.persistent and self.systemd_manager:
645645
self.systemd_manager.stop_and_disable(self.controller_service)
@@ -655,7 +655,7 @@ async def stop(self) -> None:
655655
self.times.stopped_at = datetime.now(tz=timezone.utc)
656656

657657
if self.vm.support_snapshot and self.snapshot_manager:
658-
await self.snapshot_manager.stop_for(self.vm_hash)
658+
await self.snapshot_manager.stop_for(self.vm_id)
659659
self.stop_event.set()
660660
logger.info("%s stopped", self)
661661

@@ -695,6 +695,6 @@ async def record_usage(self):
695695
await delete_record(execution_uuid=str(self.uuid))
696696
# Non-persistent VMs won't restart, so clean up their port mappings
697697
if not self.persistent:
698-
await delete_port_mappings(self.vm_hash)
698+
await delete_port_mappings(self.vm_id)
699699
if settings.EXECUTION_LOG_ENABLED:
700700
await save_execution_data(execution_uuid=self.uuid, execution_data=self.to_json())

0 commit comments

Comments
 (0)