Skip to content

Commit 72ee5de

Browse files
committed
fix sandbox reap session
1 parent 48d6a32 commit 72ee5de

3 files changed

Lines changed: 104 additions & 20 deletions

File tree

src/agentscope_runtime/sandbox/manager/heartbeat_mixin.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -287,8 +287,11 @@ def mark_session_recycled(
287287
if not model:
288288
continue
289289

290-
# if already released, don't flip back
291-
if model.state == ContainerState.RELEASED:
290+
# if already in terminal state, don't flip back
291+
if model.state in (
292+
ContainerState.RELEASED,
293+
ContainerState.REPLACED,
294+
):
292295
continue
293296

294297
model.state = ContainerState.RECYCLED
@@ -342,7 +345,8 @@ def needs_restore(self, session_ctx_id: str) -> bool:
342345
"""Check whether any container in the session is marked for restore.
343346
344347
A session is considered needing restore if any bound container is in
345-
``ContainerState.RECYCLED`` or has ``recycled_at`` set.
348+
``ContainerState.RECYCLED`` (not REPLACED, as REPLACED containers
349+
already have redirects set up).
346350
347351
Args:
348352
session_ctx_id (`str`):
@@ -360,10 +364,8 @@ def needs_restore(self, session_ctx_id: str) -> bool:
360364
model = self._load_container_model(cname)
361365
if not model:
362366
continue
363-
if (
364-
model.state == ContainerState.RECYCLED
365-
or model.recycled_at is not None
366-
):
367+
# Only RECYCLED needs restore; REPLACED already has redirect
368+
if model.state == ContainerState.RECYCLED:
367369
return True
368370
return False
369371

src/agentscope_runtime/sandbox/manager/sandbox_manager.py

Lines changed: 88 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,7 @@ def cleanup(self):
541541
if container_model.state in (
542542
ContainerState.RELEASED,
543543
ContainerState.RECYCLED,
544+
ContainerState.REPLACED,
544545
):
545546
continue
546547

@@ -567,6 +568,7 @@ def cleanup(self):
567568
if container_model.state in (
568569
ContainerState.RELEASED,
569570
ContainerState.RECYCLED,
571+
ContainerState.REPLACED,
570572
):
571573
continue
572574

@@ -1123,17 +1125,62 @@ async def get_status_async(self, *args, **kwargs):
11231125
return await asyncio.to_thread(self.get_status, *args, **kwargs)
11241126

11251127
@remote_wrapper()
1126-
def get_info(self, identity):
1127-
"""Get container information by container_name or container_id."""
1128+
def get_info(self, identity, _redirect_depth: int = 0):
1129+
"""
1130+
Get container information by container_name or container_id.
1131+
1132+
Automatically follows redirects if container is REPLACED.
1133+
1134+
Args:
1135+
identity: Container name or ID to look up
1136+
_redirect_depth: Internal parameter to prevent infinite redirect
1137+
loops
1138+
1139+
Returns:
1140+
Container information dict
1141+
1142+
Raises:
1143+
RuntimeError: If container not found or redirect loop detected
1144+
"""
1145+
# Prevent infinite redirect loops
1146+
if _redirect_depth > 10:
1147+
raise RuntimeError(
1148+
f"Redirect loop detected for container {identity}",
1149+
)
1150+
11281151
container_model = self.container_mapping.get(identity)
11291152
if container_model is None:
11301153
container_model = self.container_mapping.get(
11311154
self._generate_container_key(identity),
11321155
)
11331156
if container_model is None:
11341157
raise RuntimeError(f"No container found with id: {identity}.")
1158+
1159+
# Parse container model
1160+
if isinstance(container_model, dict):
1161+
cm_dict = container_model
1162+
elif hasattr(container_model, "model_dump"):
1163+
cm_dict = container_model.model_dump()
1164+
else:
1165+
cm_dict = container_model
1166+
1167+
# Check if this container has been replaced and needs redirect
1168+
cm = ContainerModel(**cm_dict)
1169+
if (
1170+
cm.state == ContainerState.REPLACED
1171+
and cm.redirect_to
1172+
and cm.redirect_to != identity
1173+
):
1174+
logger.debug(
1175+
f"Container {identity} is REPLACED, redirecting to "
1176+
f"{cm.redirect_to}",
1177+
)
1178+
# Follow the redirect recursively
1179+
return self.get_info(cm.redirect_to, _redirect_depth + 1)
1180+
1181+
# Return the container model as dict/json
11351182
if hasattr(container_model, "model_dump_json"):
1136-
container_model = container_model.model_dump_json()
1183+
return container_model.model_dump_json()
11371184

11381185
return container_model
11391186

@@ -1447,14 +1494,32 @@ def restore_session(self, session_ctx_id: str) -> None:
14471494
# 3) heartbeat after restore (session-level)
14481495
self.update_heartbeat(session_ctx_id)
14491496

1450-
# 4) archive old recycled records so needs_restore becomes False
1451-
for old_name in recycled_old_names:
1497+
# 4) mark old recycled records as REPLACED with redirect info
1498+
# (keep them for client compatibility, don't delete)
1499+
for i, old_name in enumerate(recycled_old_names):
14521500
try:
1453-
self.container_mapping.delete(old_name)
1501+
# Get the raw container data without following redirects
1502+
old_data = self.container_mapping.get(old_name)
1503+
if not old_data:
1504+
logger.warning(
1505+
f"restore_session: old container {old_name} not "
1506+
f"found in mapping",
1507+
)
1508+
continue
1509+
1510+
old_cm = ContainerModel(**old_data)
1511+
old_cm.state = ContainerState.REPLACED
1512+
old_cm.redirect_to = new_container_names[i]
1513+
old_cm.updated_at = time.time()
1514+
self.container_mapping.set(old_name, old_cm.model_dump())
1515+
logger.debug(
1516+
f"restore_session: marked {old_name} as REPLACED -> "
1517+
f"{new_container_names[i]}",
1518+
)
14541519
except Exception as e:
14551520
logger.warning(
1456-
f"restore_session: failed to delete old model"
1457-
f" {old_name}: {e}",
1521+
f"restore_session: failed to mark old container "
1522+
f"{old_name} as REPLACED: {e}",
14581523
)
14591524

14601525
def scan_heartbeat_once(self) -> dict:
@@ -1602,7 +1667,8 @@ def scan_pool_once(self) -> dict:
16021667

16031668
def scan_released_cleanup_once(self, max_delete: int = 200) -> dict:
16041669
"""
1605-
Delete container_mapping records whose state == RELEASED and expired.
1670+
Delete container_mapping records whose state is RELEASED or
1671+
REPLACED and expired.
16061672
16071673
TTL is config.released_key_ttl seconds. 0 disables cleanup.
16081674
"""
@@ -1635,17 +1701,26 @@ def scan_released_cleanup_once(self, max_delete: int = 200) -> dict:
16351701

16361702
cm = ContainerModel(**container_json)
16371703

1638-
if cm.state != ContainerState.RELEASED:
1704+
# Only cleanup RELEASED or REPLACED states
1705+
if cm.state not in (
1706+
ContainerState.RELEASED,
1707+
ContainerState.REPLACED,
1708+
):
16391709
result["skipped_not_released"] += 1
16401710
continue
16411711

1642-
released_at = cm.released_at or cm.updated_at or 0
1643-
if released_at <= 0:
1712+
# For RELEASED: use released_at; for REPLACED: use updated_at
1713+
cleanup_at = (
1714+
cm.released_at
1715+
if cm.state == ContainerState.RELEASED
1716+
else cm.updated_at
1717+
)
1718+
if not cleanup_at or cleanup_at <= 0:
16441719
# no timestamp -> treat as not expired
16451720
result["skipped_not_expired"] += 1
16461721
continue
16471722

1648-
if now - released_at <= ttl:
1723+
if now - cleanup_at <= ttl:
16491724
result["skipped_not_expired"] += 1
16501725
continue
16511726

src/agentscope_runtime/sandbox/model/container.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ class ContainerState(str, Enum):
1111
WARM = "warm"
1212
RUNNING = "running"
1313
RECYCLED = "recycled"
14+
REPLACED = "replaced"
1415
ERROR = "error"
1516
RELEASED = "released"
1617

@@ -115,6 +116,12 @@ class ContainerModel(BaseModel):
115116
description="Reason for recycle",
116117
)
117118

119+
redirect_to: Optional[str] = Field(
120+
default=None,
121+
description="Container name to redirect to when this container is "
122+
"REPLACED",
123+
)
124+
118125
@model_validator(mode="after")
119126
def _compat_and_defaults(self):
120127
"""Compatibility layer for ContainerModel.

0 commit comments

Comments
 (0)