Skip to content

Commit 2fef577

Browse files
committed
fix: wait for cancelled thread to finish before Metal cleanup to prevent command buffer assertion
1 parent 8351895 commit 2fef577

File tree

1 file changed

+17
-11
lines changed

1 file changed

+17
-11
lines changed

omlx/admin/oq_manager.py

Lines changed: 17 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -360,7 +360,15 @@ async def cancel_quantization(self, task_id: str) -> bool:
360360

361361
shutil.rmtree(output, ignore_errors=True)
362362

363-
# Clean up GPU state to prevent Metal errors on next task
363+
# Clean up GPU state to prevent Metal errors on next task.
364+
# asyncio.Task.cancel() doesn't stop the to_thread immediately —
365+
# the thread may still have in-flight Metal commands. Wait for the
366+
# thread to actually finish before touching Metal state.
367+
if active_task:
368+
try:
369+
await asyncio.wait_for(asyncio.shield(active_task), timeout=10.0)
370+
except (asyncio.CancelledError, asyncio.TimeoutError, Exception):
371+
pass
364372
if HAS_MLX:
365373
try:
366374
mx.synchronize()
@@ -412,17 +420,15 @@ async def _run_quantization(self, task_id: str) -> None:
412420
return
413421

414422
# Ensure GPU is clean before starting (previous task may have been cancelled)
415-
# Metal needs time to fully release command buffers after cancellation
423+
# Metal command buffers need full sync + cache clear after cancellation
416424
if HAS_MLX:
417-
try:
418-
mx.synchronize()
419-
except Exception:
420-
pass
421-
await asyncio.sleep(2.0)
422-
try:
423-
mx.clear_cache()
424-
except Exception:
425-
pass
425+
for _ in range(3):
426+
try:
427+
mx.synchronize()
428+
mx.clear_cache()
429+
break
430+
except Exception:
431+
await asyncio.sleep(1.0)
426432

427433
# Phase 1: Loading
428434
task.status = QuantStatus.LOADING

0 commit comments

Comments
 (0)