Skip to content

Commit f811381

Browse files
feat(control-plane): port production Agent Control Plane from internal repo (#67)
New modules (8):
- agent_hibernation.py (351L): Serverless agent hibernation with state serialization
- constraint_graphs.py (453L): DAG-based policy constraint resolution
- flight_recorder.py (626L): SQLite-backed audit trail with buffered writes
- ml_safety.py (542L): Jailbreak detection and anomaly detection suite
- multimodal.py (691L): Vision, audio, and RAG pipeline capabilities
- shadow_mode.py (290L): Shadow execution for safe policy testing
- supervisor_agents.py (403L): Supervisor pattern for agent-of-agents
- time_travel_debugger.py (539L): Replay-based debugging with event sourcing

Upgraded modules (5):
- control_plane.py (+614L): Enhanced agent lifecycle, advanced routing
- kernel_space.py (+504L): Protection rings, syscall interception, kernel metrics
- policy_engine.py (+357L): Conditional permissions, risk policies, resource quotas
- governance_layer.py (+261L): Bias detection, privacy analysis, alignment rules
- mute_agent.py (+175L): Enhanced mute patterns with governance integration
- __init__.py: Re-exports for all new modules

199 tests pass (+39 new), 0 regressions from pre-existing baseline.

Closes #60

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 9417a44 commit f811381

File tree

14 files changed

+6677
-432
lines changed

14 files changed

+6677
-432
lines changed

packages/agent-os/modules/control-plane/src/agent_control_plane/__init__.py

Lines changed: 176 additions & 9 deletions
Large diffs are not rendered by default.
Lines changed: 367 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,367 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""
Agent Hibernation - State-to-Disk Serialization

Feature: "Agent Hibernation (state-to-disk)"
Problem: Agents sitting idle in memory cost money (if hosted) or RAM.
Solution: Serialize the entire agent state (including the caas
          (Context-as-a-Service) pointer) to a JSON or pickle file and kill
          the process. Wake it up only when amb receives a message for it.
Result: Scale by Subtraction - removes the need for "always-on" servers.
        Serverless agents.

This module provides the infrastructure to hibernate idle agents by serializing
their complete state to disk and restoring them when needed.
"""
16+
17+
from typing import Any, Dict, Optional, List
18+
from dataclasses import dataclass, field, asdict
19+
from datetime import datetime, timedelta
20+
from enum import Enum
21+
import json
22+
import pickle
23+
import os
24+
import logging
25+
from pathlib import Path
26+
27+
28+
class HibernationFormat(Enum):
    """On-disk encoding used for a hibernated agent's state file.

    The member value doubles as the state file's extension.
    """

    JSON = "json"      # text file written with json.dump
    PICKLE = "pickle"  # binary file written with pickle.dump
32+
33+
34+
class AgentState(Enum):
    """Lifecycle position of an agent with respect to hibernation.

    ``HIBERNATING`` and ``WAKING`` are the transitional states that sit
    between the two resting states ``ACTIVE`` and ``HIBERNATED``.
    """

    ACTIVE = "active"            # running / resident in memory
    HIBERNATING = "hibernating"  # state is being written to disk
    HIBERNATED = "hibernated"    # state is on disk
    WAKING = "waking"            # state is being read back from disk
40+
41+
42+
@dataclass
class HibernatedAgentMetadata:
    """Bookkeeping record describing one agent whose state is stored on disk.

    The record holds where the state file lives and how it is encoded, not
    the agent state itself.
    """

    # Identity of the agent and of the session that was hibernated.
    agent_id: str
    session_id: str

    # When the agent was put to sleep.
    hibernated_at: datetime

    # Location, encoding and size of the serialized state file.
    state_file_path: str
    format: HibernationFormat
    state_size_bytes: int

    # Last activity timestamp known for the agent at hibernation time.
    last_activity: datetime

    # caas pointer, if the agent had one.
    context_pointer: Optional[str] = None

    # Free-form extra information; each instance gets its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
54+
55+
56+
@dataclass
class HibernationConfig:
    """Tunable settings that control how and where agents are hibernated."""

    # Master switch for hibernation.
    # NOTE(review): not consulted by the manager code in this file - confirm
    # where it is enforced.
    enabled: bool = True

    # Idle time, in seconds, after which an agent counts as idle (5 minutes).
    idle_timeout_seconds: int = 300

    # Directory that receives the serialized state files.
    storage_path: str = "/tmp/agent_hibernation"

    # Encoding used for newly written state files.
    format: HibernationFormat = HibernationFormat.JSON

    # Intended upper bound on simultaneously hibernated agents.
    # NOTE(review): not enforced by the manager code in this file - confirm.
    max_hibernated_agents: int = 1000

    # Hibernated agents older than this many days are eligible for cleanup.
    auto_cleanup_days: int = 7

    # Whether state files should be compressed.
    # NOTE(review): no compression is performed by the code in this file.
    compress: bool = False
66+
67+
68+
class HibernationManager:
    """
    Manages agent hibernation and wake-up operations.

    This class handles:
    - Serialization of complete agent state to disk
    - Deserialization and restoration of agent state
    - Tracking of hibernated agents
    - Automatic cleanup of old hibernated states

    All tracking (``hibernated_agents``, ``agent_activity``, ``agent_states``)
    lives in memory on this instance; only the serialized agent state is
    written under ``config.storage_path``.
    """

    def __init__(self, config: Optional["HibernationConfig"] = None):
        """
        Initialize the hibernation manager.

        Creates the storage directory (including parents) if it is missing.

        Args:
            config: Configuration for hibernation behavior; a default
                ``HibernationConfig`` is used when omitted.
        """
        self.config = config if config is not None else HibernationConfig()
        self.logger = logging.getLogger("HibernationManager")

        # Create storage directory if it doesn't exist
        Path(self.config.storage_path).mkdir(parents=True, exist_ok=True)

        # Track hibernated agents (agent_id -> metadata of its state file)
        self.hibernated_agents: Dict[str, "HibernatedAgentMetadata"] = {}

        # Track agent activity for idle detection (agent_id -> last activity)
        self.agent_activity: Dict[str, datetime] = {}

        # Track agent states (agent_id -> AgentState)
        self.agent_states: Dict[str, Any] = {}

        self.logger.info(f"HibernationManager initialized with storage at {self.config.storage_path}")

    def serialize_agent_state(
        self,
        agent_id: str,
        agent_context: Any,
        caas_pointer: Optional[str] = None,
        additional_state: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Serialize agent state to a dictionary.

        Reads ``session_id``, ``created_at``, ``permissions`` and ``metadata``
        from ``agent_context``. Permission keys are stringified and permission
        values are replaced by their ``.value``.

        Args:
            agent_id: Unique identifier for the agent
            agent_context: AgentContext object containing agent session data
            caas_pointer: Optional pointer to context in caas (Context-as-a-Service)
            additional_state: Optional additional state to serialize

        Returns:
            Dictionary containing serialized agent state
        """
        return {
            "agent_id": agent_id,
            "session_id": agent_context.session_id,
            "created_at": agent_context.created_at.isoformat(),
            "permissions": {
                str(k): v.value for k, v in agent_context.permissions.items()
            },
            # NOTE: stored by reference; it must be serializable in the
            # configured format for hibernation to succeed.
            "metadata": agent_context.metadata,
            "caas_pointer": caas_pointer,
            "hibernated_at": datetime.now().isoformat(),
            "additional_state": additional_state or {},
        }

    def deserialize_agent_state(self, state: Dict[str, Any]) -> Dict[str, Any]:
        """
        Deserialize agent state from a dictionary.

        The input dictionary is left untouched; a shallow copy is returned
        with the ISO timestamp strings converted back to ``datetime``.

        Args:
            state: Dictionary containing serialized agent state

        Returns:
            Dictionary with deserialized state components
        """
        restored = dict(state)

        # Convert ISO format strings back to datetime
        for key in ("created_at", "hibernated_at"):
            if key in restored:
                restored[key] = datetime.fromisoformat(restored[key])

        return restored

    def _state_file_path(self, agent_id: str, session_id: Any) -> str:
        """Build the path of an agent's state file inside the storage directory."""
        file_name = f"{agent_id}_{session_id}.{self.config.format.value}"

        # A path separator inside an ID would place the file outside
        # storage_path (or in a non-existent sub-directory); neutralize it.
        for separator in (os.sep, os.altsep):
            if separator:
                file_name = file_name.replace(separator, "_")

        return os.path.join(self.config.storage_path, file_name)

    def hibernate_agent(
        self,
        agent_id: str,
        agent_context: Any,
        caas_pointer: Optional[str] = None,
        additional_state: Optional[Dict[str, Any]] = None
    ) -> "HibernatedAgentMetadata":
        """
        Hibernate an agent by saving its state to disk.

        The state is first written to a temporary file and then moved into
        place, so a failed write never leaves a truncated state file and
        never destroys a previously saved one.

        Args:
            agent_id: Unique identifier for the agent
            agent_context: AgentContext object to serialize
            caas_pointer: Optional pointer to context in caas
            additional_state: Optional additional state to save

        Returns:
            Metadata about the hibernated agent

        Raises:
            Exception: Any error raised while serializing or writing the
                state is logged and re-raised; the agent is marked ACTIVE.
        """
        self.logger.info(f"Hibernating agent {agent_id}")

        # Update agent state
        self.agent_states[agent_id] = AgentState.HIBERNATING

        tmp_path: Optional[str] = None
        try:
            # Serialize inside the try block so that a failure here also
            # rolls the agent back out of the HIBERNATING state.
            state = self.serialize_agent_state(agent_id, agent_context, caas_pointer, additional_state)

            # Determine file path
            file_path = self._state_file_path(agent_id, agent_context.session_id)
            tmp_path = f"{file_path}.tmp"

            # Save to disk
            if self.config.format == HibernationFormat.JSON:
                with open(tmp_path, 'w', encoding="utf-8") as f:
                    json.dump(state, f, indent=2)
            else:  # PICKLE
                with open(tmp_path, 'wb') as f:
                    pickle.dump(state, f)
            os.replace(tmp_path, file_path)
            tmp_path = None  # nothing left to clean up

            # Get file size
            state_size = os.path.getsize(file_path)

            # Create metadata; reuse the timestamp stored in the file so the
            # in-memory record and the on-disk state agree.
            metadata = HibernatedAgentMetadata(
                agent_id=agent_id,
                session_id=agent_context.session_id,
                hibernated_at=datetime.fromisoformat(state["hibernated_at"]),
                state_file_path=file_path,
                format=self.config.format,
                state_size_bytes=state_size,
                last_activity=self.agent_activity.get(agent_id, datetime.now()),
                context_pointer=caas_pointer
            )

            # Track hibernated agent
            self.hibernated_agents[agent_id] = metadata
            self.agent_states[agent_id] = AgentState.HIBERNATED

            self.logger.info(f"Agent {agent_id} hibernated successfully. State saved to {file_path} ({state_size} bytes)")

            return metadata

        except Exception as e:
            self.logger.error(f"Failed to hibernate agent {agent_id}: {e}")
            self.agent_states[agent_id] = AgentState.ACTIVE
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    # The temp file may never have been created; the original
                    # error is the one worth propagating.
                    pass
            raise

    def wake_agent(self, agent_id: str) -> Dict[str, Any]:
        """
        Wake up a hibernated agent by restoring its state from disk.

        Args:
            agent_id: Unique identifier for the agent to wake

        Returns:
            Dictionary containing the restored agent state

        Raises:
            ValueError: If the agent is not tracked as hibernated.
            Exception: Any error raised while reading or decoding the state
                file is logged and re-raised; the agent stays HIBERNATED.
        """
        if agent_id not in self.hibernated_agents:
            raise ValueError(f"Agent {agent_id} is not hibernated")

        self.logger.info(f"Waking agent {agent_id}")

        # Update state
        self.agent_states[agent_id] = AgentState.WAKING

        metadata = self.hibernated_agents[agent_id]

        try:
            # Load state from disk, using the format recorded at hibernation
            # time (the configured format may have changed since).
            if metadata.format == HibernationFormat.JSON:
                with open(metadata.state_file_path, 'r', encoding="utf-8") as f:
                    state = json.load(f)
            else:  # PICKLE
                # SECURITY: unpickling executes arbitrary code embedded in the
                # file. Only safe while storage_path is writable exclusively
                # by trusted processes.
                with open(metadata.state_file_path, 'rb') as f:
                    state = pickle.load(f)

            # Deserialize state
            restored_state = self.deserialize_agent_state(state)

            # Update tracking
            self.agent_activity[agent_id] = datetime.now()
            self.agent_states[agent_id] = AgentState.ACTIVE

            # Remove from hibernated agents
            # NOTE(review): the state file itself is left on disk and is no
            # longer tracked, so cleanup_old_hibernated_agents will not
            # remove it - confirm whether that is intended.
            del self.hibernated_agents[agent_id]

            self.logger.info(f"Agent {agent_id} woken successfully")

            return restored_state

        except Exception as e:
            self.logger.error(f"Failed to wake agent {agent_id}: {e}")
            self.agent_states[agent_id] = AgentState.HIBERNATED
            raise

    def is_agent_hibernated(self, agent_id: str) -> bool:
        """Check if an agent is currently hibernated"""
        return agent_id in self.hibernated_agents

    def get_hibernated_agents(self) -> List["HibernatedAgentMetadata"]:
        """Get list of all hibernated agents"""
        return list(self.hibernated_agents.values())

    def record_agent_activity(self, agent_id: str):
        """Record activity for an agent (resets idle timer)"""
        self.agent_activity[agent_id] = datetime.now()
        # Unknown agents and agents marked HIBERNATED become ACTIVE;
        # transitional states (HIBERNATING / WAKING) are left alone.
        if self.agent_states.get(agent_id, AgentState.HIBERNATED) == AgentState.HIBERNATED:
            self.agent_states[agent_id] = AgentState.ACTIVE

    def get_idle_agents(self, min_idle_seconds: Optional[int] = None) -> List[str]:
        """
        Get list of agent IDs that have been idle for the specified duration.

        Args:
            min_idle_seconds: Minimum idle time in seconds (uses config default
                if None). An explicit ``0`` is honored and matches every
                non-hibernated agent with recorded activity.

        Returns:
            List of agent IDs that are idle
        """
        # Compare against None: `or` would silently replace an explicit 0
        # with the configured timeout.
        if min_idle_seconds is None:
            idle_threshold = self.config.idle_timeout_seconds
        else:
            idle_threshold = min_idle_seconds
        now = datetime.now()

        return [
            agent_id
            for agent_id, last_activity in self.agent_activity.items()
            # Skip already hibernated agents
            if not self.is_agent_hibernated(agent_id)
            and (now - last_activity).total_seconds() >= idle_threshold
        ]

    def cleanup_old_hibernated_agents(self, max_age_days: Optional[int] = None):
        """
        Clean up hibernated agents older than the specified age.

        Args:
            max_age_days: Maximum age in days (uses config default if None).
                An explicit ``0`` is honored and removes every hibernated
                agent whose age is greater than zero.
        """
        # Compare against None: `or` would silently replace an explicit 0
        # with the configured retention period.
        max_age = self.config.auto_cleanup_days if max_age_days is None else max_age_days
        now = datetime.now()
        cleanup_threshold = timedelta(days=max_age)

        # Collect first: _cleanup_hibernated_agent mutates the dict.
        agents_to_cleanup = [
            agent_id
            for agent_id, metadata in self.hibernated_agents.items()
            if now - metadata.hibernated_at > cleanup_threshold
        ]

        for agent_id in agents_to_cleanup:
            self._cleanup_hibernated_agent(agent_id)

    def _cleanup_hibernated_agent(self, agent_id: str):
        """Remove hibernated agent and delete its state file.

        Best-effort: if the state file cannot be deleted the error is logged
        and the agent stays tracked so a later cleanup can retry.
        """
        metadata = self.hibernated_agents.get(agent_id)
        if metadata is None:
            return

        # Delete state file; a file that is already gone is not an error.
        try:
            os.remove(metadata.state_file_path)
        except (FileNotFoundError, NotADirectoryError):
            pass
        except OSError as e:
            self.logger.error(f"Failed to cleanup hibernated agent {agent_id}: {e}")
            return

        # Remove from tracking
        del self.hibernated_agents[agent_id]
        self.agent_states.pop(agent_id, None)
        self.agent_activity.pop(agent_id, None)

        self.logger.info(f"Cleaned up hibernated agent {agent_id}")

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics about hibernation.

        Returns:
            Dictionary with the number and total size of tracked hibernated
            agents, the number of agents currently marked ACTIVE, and the
            storage path, format and idle timeout from the configuration.
        """
        total_hibernated = len(self.hibernated_agents)
        total_size = sum(m.state_size_bytes for m in self.hibernated_agents.values())
        active_agents = sum(1 for s in self.agent_states.values() if s == AgentState.ACTIVE)

        return {
            "total_hibernated_agents": total_hibernated,
            "total_state_size_bytes": total_size,
            "total_state_size_mb": total_size / (1024 * 1024),
            "active_agents": active_agents,
            "storage_path": self.config.storage_path,
            "format": self.config.format.value,
            "idle_timeout_seconds": self.config.idle_timeout_seconds
        }

0 commit comments

Comments
 (0)