Traceback (most recent call last):
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/queueing.py", line 867, in process_events
response = await route_utils.call_process_api(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...<5 lines>...
)
^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/route_utils.py", line 386, in call_process_api
output = await app.get_blocks().process_api(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...<11 lines>...
)
^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/blocks.py", line 2280, in process_api
result = await self.call_function(
^^^^^^^^^^^^^^^^^^^^^^^^^
...<8 lines>...
)
^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/blocks.py", line 1669, in call_function
prediction = await utils.async_iteration(iterator)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/utils.py", line 886, in async_iteration
return await anext(iterator)
^^^^^^^^^^^^^^^^^^^^^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/utils.py", line 1011, in asyncgen_wrapper
response = await iterator.__anext__()
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/chat_interface.py", line 537, in _wrapper
async for chunk in submit_fn(*args, **kwargs):
yield chunk
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/chat_interface.py", line 965, in _stream_fn
first_response = await utils.async_iteration(generator)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/utils.py", line 886, in async_iteration
return await anext(iterator)
^^^^^^^^^^^^^^^^^^^^^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/utils.py", line 868, in __anext__
return await anyio.to_thread.run_sync(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
run_sync_iterator_async, self.iterator, limiter=self.limiter
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/anyio/to_thread.py", line 63, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
func, args, abandon_on_cancel=abandon_on_cancel, limiter=limiter
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/anyio/_backends/_asyncio.py", line 2518, in run_sync_in_worker_thread
return await future
^^^^^^^^^^^^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/anyio/_backends/_asyncio.py", line 1002, in run
result = context.run(func, *args)
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/gradio/utils.py", line 851, in run_sync_iterator_async
return next(iterator)
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/mlx_vlm/chat_ui.py", line 310, in chat
for chunk in stream_generate(
~~~~~~~~~~~~~~~^
state.model,
^^^^^^^^^^^^
...<3 lines>...
**gen_kwargs,
^^^^^^^^^^^^^
):
^
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/mlx_vlm/generate/dispatch.py", line 930, in stream_generate
yield from stream_diffusion_generate_from_kwargs(
...<8 lines>...
)
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/mlx_vlm/generate/diffusion.py", line 1124, in stream_diffusion_generate_from_kwargs
yield from stream_diffusion_generate(
...<22 lines>...
)
File "/Users/ek/projects/mlx-lm-test/pyenv/lib/python3.13/site-packages/mlx_vlm/generate/diffusion.py", line 800, in stream_diffusion_generate
mx.eval([c.state for c in kv_cache])
~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: There is no Stream(gpu, 1) in current thread.
Package Version
------------------ -----------
aiohappyeyeballs 2.6.2
aiohttp 3.14.1
aiosignal 1.4.0
annotated-doc 0.0.4
annotated-types 0.7.0
anyio 4.13.0
attrs 26.1.0
audioop-lts 0.2.2
brotli 1.2.0
certifi 2026.5.20
cffi 2.0.0
charset-normalizer 3.4.7
click 8.4.1
datasets 5.0.0
dill 0.4.1
fastapi 0.136.3
filelock 3.29.3
frozenlist 1.8.0
fsspec 2026.4.0
gradio 6.18.0
gradio_client 2.5.0
groovy 0.1.2
h11 0.16.0
hf-gradio 0.4.1
hf-xet 1.5.1
httpcore 1.0.9
httpx 0.28.1
huggingface_hub 1.19.0
idna 3.18
Jinja2 3.1.6
llguidance 1.7.6
markdown-it-py 4.2.0
MarkupSafe 3.0.3
mdurl 0.1.2
miniaudio 1.71
mlx 0.31.2
mlx-audio 0.4.4
mlx-lm 0.31.3
mlx-metal 0.31.2
mlx-vlm 0.6.3
multidict 6.7.1
multiprocess 0.70.19
numpy 2.4.6
opencv-python 4.13.0.92
orjson 3.11.9
packaging 26.2
pandas 3.0.3
pillow 12.2.0
pip 25.1.1
propcache 0.5.2
protobuf 7.35.1
pyarrow 24.0.0
pycparser 3.0
pydantic 2.13.4
pydantic_core 2.46.4
pydub 0.25.1
Pygments 2.20.0
python-dateutil 2.9.0.post0
python-multipart 0.0.32
pytz 2026.2
PyYAML 6.0.3
regex 2026.5.9
requests 2.34.2
rich 15.0.0
safehttpx 0.1.7
safetensors 0.8.0
scipy 1.17.1
semantic-version 2.10.0
sentencepiece 0.2.1
shellingham 1.5.4
six 1.17.0
sounddevice 0.5.5
starlette 1.3.0
tokenizers 0.22.2
tomlkit 0.14.0
tqdm 4.68.2
transformers 5.11.0
typer 0.25.1
typing_extensions 4.15.0
typing-inspection 0.4.2
urllib3 2.7.0
uvicorn 0.49.0
xxhash 3.7.0
yarl 1.24.2
Running the following model on an M1 Mac:

    python -m mlx_vlm.chat_ui --model mlx-community/diffusiongemma-26B-A4B-it-4bit

then typing and submitting any prompt in the UI raises `RuntimeError: There is no Stream(gpu, 1) in current thread.` (full traceback above).
Versions:
Python 3.13.3

Quick and dirty workaround:
1. In `./mlx_vlm/chat_ui.py`, add `mx.set_default_device(mx.cpu)` after `import mlx.core as mx`.
2. In `./mlx_vlm/generate/common.py`, replace `max_rec_size = mx.device_info()["max_recommended_working_set_size"]` with `max_rec_size = mx.device_info().get("max_recommended_working_set_size", 0)`.