Skip to content

Commit bee8a2e

Browse files
alyssapowell and claude committed
add AGX_RELAX_CDM_CTXSTORE_TIMEOUT driver workaround
Relaxes Metal command buffer context store timeout to reduce kernel panics on long-running GPU workloads. Zero-cost env var hint to the IOGPUFamily driver, set at import time. Suggested by @zcbenz (MLX maintainer) in ml-explore/mlx#3267. Surfaced by Harperbot/metal-guard. Bump to v1.0.1. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 67462ba commit bee8a2e

2 files changed

Lines changed: 14 additions & 2 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "mlx-halo"
7-
version = "1.0.0"
7+
version = "1.0.1"
88
description = "Pre-flight safety checks for MLX models on Apple Silicon. Prevents kernel panics from overlapping Metal GPU allocations."
99
license = {file = "LICENSE.md"}
1010
requires-python = ">=3.10"

src/mlx_halo/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,13 +23,25 @@
2323
result = halo.check_all(estimated_model_gb=18.0)
2424
"""
2525

26+
import os as _os
27+
28+
# ── Apple GPU driver workaround ──────────────────────────────────────────
29+
# Relaxes the Metal command buffer context store timeout to reduce kernel
30+
# panics on long-running GPU workloads. Zero-cost env var hint to the
31+
# IOGPUFamily driver — safe to set unconditionally (a value already present in the environment is left untouched).
32+
#
33+
# Suggested by @zcbenz (MLX maintainer) in ml-explore/mlx#3267.
34+
# Surfaced by Harperbot/metal-guard (runtime MLX safety layer).
35+
if "AGX_RELAX_CDM_CTXSTORE_TIMEOUT" not in _os.environ:
36+
_os.environ["AGX_RELAX_CDM_CTXSTORE_TIMEOUT"] = "1"
37+
2638
from .safety import HaloCheck, preflight
2739
from .monitor import SystemMonitor, get_monitor
2840
from .pain import PainCalculator, get_pain_calculator, get_current_pain
2941
from .memory import get_gpu_memory_status, clear_gpu_cache, wait_for_memory_drain
3042
from .types import SystemMetrics, PainProfile, HaloResult, HealthStatus, MemoryStatus
3143

32-
__version__ = "1.0.0"
44+
__version__ = "1.0.1"
3345

3446
__all__ = [
3547
"preflight",

0 commit comments

Comments
 (0)