Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions dream-server/extensions/services/dashboard-api/agent_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def to_dict(self) -> dict:
"tokens_per_second": round(self.tokens_per_second, 2),
"error_rate_1h": round(self.error_rate_1h, 2),
"queue_depth": self.queue_depth,
"last_update": self.last_update.isoformat()
"last_update": self.last_update.isoformat(),
}


Expand All @@ -43,9 +43,11 @@ async def refresh(self):
"""Query cluster status from smart proxy"""
try:
proc = await asyncio.create_subprocess_exec(
"curl", "-s", f"http://localhost:{os.environ.get('CLUSTER_PROXY_PORT', '9199')}/status",
"curl",
"-s",
f"http://localhost:{os.environ.get('CLUSTER_PROXY_PORT', '9199')}/status",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
stderr=asyncio.subprocess.PIPE,
)
stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=5)

Expand All @@ -63,7 +65,7 @@ def to_dict(self) -> dict:
"nodes": self.nodes,
"total_gpus": self.total_gpus,
"active_gpus": self.active_gpus,
"failover_ready": self.failover_ready
"failover_ready": self.failover_ready,
}


Expand All @@ -76,16 +78,14 @@ def __init__(self, history_minutes: int = 15):

def add_sample(self, tokens_per_sec: float):
"""Add a new throughput sample"""
self.data_points.append({
"timestamp": datetime.now().isoformat(),
"tokens_per_sec": tokens_per_sec
})
self.data_points.append(
{"timestamp": datetime.now().isoformat(), "tokens_per_sec": tokens_per_sec}
)

# Prune old data
cutoff = datetime.now() - timedelta(minutes=self.history_minutes)
self.data_points = [
p for p in self.data_points
if datetime.fromisoformat(p["timestamp"]) > cutoff
p for p in self.data_points if datetime.fromisoformat(p["timestamp"]) > cutoff
]

def get_stats(self) -> dict:
Expand All @@ -98,7 +98,7 @@ def get_stats(self) -> dict:
"current": values[-1] if values else 0,
"average": sum(values) / len(values),
"peak": max(values) if values else 0,
"history": self.data_points[-30:] # Last 30 points
"history": self.data_points[-30:], # Last 30 points
}


Expand Down Expand Up @@ -129,5 +129,5 @@ def get_full_agent_metrics() -> dict:
"timestamp": datetime.now().isoformat(),
"agent": agent_metrics.to_dict(),
"cluster": cluster_status.to_dict(),
"throughput": throughput.get_stats()
"throughput": throughput.get_stats(),
}
42 changes: 29 additions & 13 deletions dream-server/extensions/services/dashboard-api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,7 @@
INSTALL_DIR = os.environ.get("DREAM_INSTALL_DIR", os.path.expanduser("~/dream-server"))
DATA_DIR = os.environ.get("DREAM_DATA_DIR", os.path.expanduser("~/.dream-server"))
EXTENSIONS_DIR = Path(
os.environ.get(
"DREAM_EXTENSIONS_DIR",
str(Path(INSTALL_DIR) / "extensions" / "services")
)
os.environ.get("DREAM_EXTENSIONS_DIR", str(Path(INSTALL_DIR) / "extensions" / "services"))
)

DEFAULT_SERVICE_HOST = os.environ.get("SERVICE_HOST", "host.docker.internal")
Expand All @@ -39,7 +36,9 @@ def _read_manifest_file(path: Path) -> dict[str, Any]:
return data


def load_extension_manifests(manifest_dir: Path, gpu_backend: str) -> tuple[dict[str, dict[str, Any]], list[dict[str, Any]]]:
def load_extension_manifests(
manifest_dir: Path, gpu_backend: str
) -> tuple[dict[str, dict[str, Any]], list[dict[str, Any]]]:
"""Load service and feature definitions from extension manifests."""
services: dict[str, dict[str, Any]] = {}
features: list[dict[str, Any]] = []
Expand Down Expand Up @@ -86,7 +85,11 @@ def load_extension_manifests(manifest_dir: Path, gpu_backend: str) -> tuple[dict

ext_port_env = service.get("external_port_env")
ext_port_default = service.get("external_port_default", service.get("port", 0))
external_port = int(os.environ.get(ext_port_env, str(ext_port_default))) if ext_port_env else int(ext_port_default)
external_port = (
int(os.environ.get(ext_port_env, str(ext_port_default)))
if ext_port_env
else int(ext_port_default)
)

services[service_id] = {
"host": host,
Expand All @@ -103,7 +106,11 @@ def load_extension_manifests(manifest_dir: Path, gpu_backend: str) -> tuple[dict
if not isinstance(feature, dict):
continue
supported = feature.get("gpu_backends", ["amd", "nvidia", "apple"])
if gpu_backend != "apple" and gpu_backend not in supported and "all" not in supported:
if (
gpu_backend != "apple"
and gpu_backend not in supported
and "all" not in supported
):
continue
if feature.get("id") and feature.get("name"):
features.append(feature)
Expand All @@ -112,7 +119,12 @@ def load_extension_manifests(manifest_dir: Path, gpu_backend: str) -> tuple[dict
except Exception as e:
logger.warning("Failed loading manifest %s: %s", path, e)

logger.info("Loaded %d extension manifests (%d services, %d features)", loaded, len(services), len(features))
logger.info(
"Loaded %d extension manifests (%d services, %d features)",
loaded,
len(services),
len(features),
)
return services, features


Expand All @@ -121,7 +133,9 @@ def load_extension_manifests(manifest_dir: Path, gpu_backend: str) -> tuple[dict
MANIFEST_SERVICES, MANIFEST_FEATURES = load_extension_manifests(EXTENSIONS_DIR, GPU_BACKEND)
SERVICES = MANIFEST_SERVICES
if not SERVICES:
logger.error("No services loaded from manifests in %s — dashboard will have no services", EXTENSIONS_DIR)
logger.error(
"No services loaded from manifests in %s — dashboard will have no services", EXTENSIONS_DIR
)

# --- Features ---

Expand All @@ -147,12 +161,14 @@ def resolve_workflow_dir() -> Path:
# Location of the workflow catalog: a file named "catalog.json" directly inside WORKFLOW_DIR.
WORKFLOW_CATALOG_FILE = WORKFLOW_DIR / "catalog.json"
# Catalog structure with no workflows and no categories.
# NOTE(review): presumably the fallback used when the catalog file is missing or unreadable — confirm at the call site.
DEFAULT_WORKFLOW_CATALOG = {"workflows": [], "categories": {}}


def _default_n8n_url() -> str:
    """Build the fallback n8n base URL from the loaded service table.

    Reads the ``"n8n"`` entry of the module-level ``SERVICES`` mapping. If the
    entry is absent, or lacks a ``"host"`` or ``"port"`` key, the missing piece
    defaults to host ``"n8n"`` and port ``5678`` respectively.

    Returns:
        A URL of the form ``http://<host>:<port>``.
    """
    n8n_entry = SERVICES.get("n8n", {})
    host, port = n8n_entry.get("host", "n8n"), n8n_entry.get("port", 5678)
    return f"http://{host}:{port}"


# Base URL for n8n: the N8N_URL environment variable takes precedence; otherwise the
# value computed by _default_n8n_url() is used. The default is evaluated eagerly at
# import time, whether or not N8N_URL is set.
N8N_URL = os.environ.get("N8N_URL", _default_n8n_url())
# n8n API key from the N8N_API_KEY environment variable; empty string when unset.
N8N_API_KEY = os.environ.get("N8N_API_KEY", "")

Expand All @@ -164,18 +180,18 @@ def _default_n8n_url() -> str:
"general": {
"name": "General Helper",
"system_prompt": "You are a friendly and helpful AI assistant. You're knowledgeable, patient, and aim to be genuinely useful. Keep responses clear and conversational.",
"icon": "\U0001f4ac"
"icon": "\U0001f4ac",
},
"coding": {
"name": "Coding Buddy",
"system_prompt": "You are a skilled programmer and technical assistant. You write clean, well-documented code and explain technical concepts clearly. You're precise, thorough, and love solving problems.",
"icon": "\U0001f4bb"
"icon": "\U0001f4bb",
},
"creative": {
"name": "Creative Writer",
"system_prompt": "You are an imaginative creative writer and storyteller. You craft vivid descriptions, engaging narratives, and think outside the box. You're expressive and enjoy wordplay.",
"icon": "\U0001f3a8"
}
"icon": "\U0001f3a8",
},
}

# --- Sidebar Icons ---
Expand Down
41 changes: 27 additions & 14 deletions dream-server/extensions/services/dashboard-api/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def _read_sysfs(path: str) -> Optional[str]:
def _find_amd_gpu_sysfs() -> Optional[str]:
"""Find the sysfs base path for an AMD GPU device."""
import glob

for card_dir in sorted(glob.glob("/sys/class/drm/card*/device")):
vendor = _read_sysfs(f"{card_dir}/vendor")
if vendor == "0x1002":
Expand All @@ -44,6 +45,7 @@ def _find_amd_gpu_sysfs() -> Optional[str]:
def _find_hwmon_dir(device_path: str) -> Optional[str]:
    """Locate the hwmon directory belonging to a GPU device path.

    Args:
        device_path: Directory that is expected to contain a ``hwmon``
            subdirectory with one or more ``hwmon*`` entries.

    Returns:
        The lexicographically first ``<device_path>/hwmon/hwmon*`` match, or
        ``None`` when nothing matches.
    """
    import glob

    candidates = glob.glob(f"{device_path}/hwmon/hwmon*")
    # The smallest string is exactly the first element of the sorted match list.
    return min(candidates, default=None)

Expand Down Expand Up @@ -119,11 +121,13 @@ def get_gpu_info_nvidia() -> Optional[GPUInfo]:
Handles multi-GPU systems by summing VRAM across all GPUs and
reporting aggregate utilization and peak temperature.
"""
success, output = run_command([
"nvidia-smi",
"--query-gpu=name,memory.used,memory.total,utilization.gpu,temperature.gpu,power.draw",
"--format=csv,noheader,nounits"
])
success, output = run_command(
[
"nvidia-smi",
"--query-gpu=name,memory.used,memory.total,utilization.gpu,temperature.gpu,power.draw",
"--format=csv,noheader,nounits",
]
)

if not success or not output:
return None
Expand All @@ -140,19 +144,27 @@ def get_gpu_info_nvidia() -> Optional[GPUInfo]:
if len(parts) < 5:
continue
power_w = None
if len(parts) >= 6 and parts[5] not in ("[N/A]", "[Not Supported]", "N/A", "Not Supported", ""):
if len(parts) >= 6 and parts[5] not in (
"[N/A]",
"[Not Supported]",
"N/A",
"Not Supported",
"",
):
try:
power_w = round(float(parts[5]), 1)
except (ValueError, TypeError):
pass
gpus.append({
"name": parts[0],
"mem_used": int(parts[1]),
"mem_total": int(parts[2]),
"util": int(parts[3]),
"temp": int(parts[4]),
"power_w": power_w,
})
gpus.append(
{
"name": parts[0],
"mem_used": int(parts[1]),
"mem_total": int(parts[2]),
"util": int(parts[3]),
"temp": int(parts[4]),
"power_w": power_w,
}
)

if not gpus:
return None
Expand Down Expand Up @@ -229,6 +241,7 @@ def get_gpu_info_apple() -> Optional[GPUInfo]:
success, vm_output = run_command(["vm_stat"])
if success:
import re

pages = {}
for line in vm_output.splitlines():
match = re.match(r"(.+?):\s+(\d+)", line)
Expand Down
Loading
Loading