Description
Is there an existing issue for this problem?
- I have searched the existing issues
Operating system
macOS
GPU vendor
Apple Silicon (MPS)
GPU model
M3
GPU VRAM
24
Version number
5.10.0
Browser
Safari 18.3.1
Python dependencies
{
"version": "5.10.0",
"dependencies": {
"accelerate" : "1.6.0" ,
"compel" : "2.0.2" ,
"cuda" : null ,
"diffusers" : "0.33.0" ,
"numpy" : "1.26.4" ,
"opencv" : "4.9.0.80",
"onnx" : "1.16.1" ,
"pillow" : "11.2.1" ,
"python" : "3.11.10" ,
"torch" : "2.6.0" ,
"torchvision" : "0.21.0" ,
"transformers": "4.51.3" ,
"xformers" : null
},
"config": {
"schema_version": "4.0.2",
"legacy_models_yaml_path": null,
"host": "127.0.0.1",
"port": 9090,
"allow_origins": [],
"allow_credentials": true,
"allow_methods": [""],
"allow_headers": [""],
"ssl_certfile": null,
"ssl_keyfile": null,
"log_tokenization": false,
"patchmatch": true,
"models_dir": "models",
"convert_cache_dir": "models/.convert_cache",
"download_cache_dir": "models/.download_cache",
"legacy_conf_dir": "configs",
"db_dir": "databases",
"outputs_dir": "/Users/davidburnett/invokeai/outputs",
"custom_nodes_dir": "nodes",
"style_presets_dir": "style_presets",
"workflow_thumbnails_dir": "workflow_thumbnails",
"log_handlers": ["console"],
"log_format": "color",
"log_level": "info",
"log_sql": false,
"log_level_network": "warning",
"use_memory_db": false,
"dev_reload": false,
"profile_graphs": false,
"profile_prefix": null,
"profiles_dir": "profiles",
"max_cache_ram_gb": null,
"max_cache_vram_gb": null,
"log_memory_usage": false,
"device_working_mem_gb": 3,
"enable_partial_loading": false,
"keep_ram_copy_of_weights": false,
"ram": null,
"vram": null,
"lazy_offload": true,
"pytorch_cuda_alloc_conf": null,
"device": "mps",
"precision": "bfloat16",
"sequential_guidance": false,
"attention_type": "torch-sdp",
"attention_slice_size": 1,
"force_tiled_decode": false,
"pil_compress_level": 1,
"max_queue_size": 10000,
"clear_queue_on_startup": false,
"allow_nodes": null,
"deny_nodes": null,
"node_cache_size": 512,
"hashing_algorithm": "blake3_single",
"remote_api_tokens": null,
"scan_models_on_startup": false
},
"set_config_fields": [
"precision" , "outputs_dir" , "keep_ram_copy_of_weights", "attention_type" ,
"attention_slice_size" , "legacy_models_yaml_path" , "device"
]
}
What happened
Running a simple Linear UI Flux render using a GGUF based model now fails with
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/_ops.py", line 723, in __call__
return self._op(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Tensor for argument weight is on cpu but expected on mps
I've tried multiple GGUF-based models and they've all failed with the same error; an OG non-quantised Flux model works fine.
The full backtrace is...
[2025-04-18 11:56:59,089]::[InvokeAI]::ERROR --> Error while invoking session ead91b2d-b83d-4fef-a13d-bf9bf9923340, invocation 4c89d2a6-b5e1-466d-95bc-cf2c3aa14333 (flux_denoise): Tensor for argument weight is on cpu but expected on mps
[2025-04-18 11:56:59,089]::[InvokeAI]::ERROR --> Traceback (most recent call last):
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/app/services/session_processor/session_processor_default.py", line 129, in run_node
output = invocation.invoke_internal(context=context, services=self._services)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/app/invocations/baseinvocation.py", line 212, in invoke_internal
output = self.invoke(context)
^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/app/invocations/flux_denoise.py", line 155, in invoke
latents = self._run_diffusion(context)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/app/invocations/flux_denoise.py", line 379, in _run_diffusion
x = denoise(
^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/backend/flux/denoise.py", line 75, in denoise
pred = model(
^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/backend/flux/model.py", line 110, in forward
img = self.img_in(img)
^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/custom_linear.py", line 84, in forward
return super().forward(input)
^^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/nn/modules/linear.py", line 125, in forward
return F.linear(input, self.weight, self.bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/backend/quantization/gguf/ggml_tensor.py", line 187, in __torch_dispatch__
return GGML_TENSOR_OP_TABLE[func](func, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/backend/quantization/gguf/ggml_tensor.py", line 37, in dequantize_and_run_debug
return func(*dequantized_args, **dequantized_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/_ops.py", line 723, in __call__
return self._op(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Tensor for argument weight is on cpu but expected on mps
I've created a debug version of dequantize_and_run
which suggests the weight and bias are on the CPU device after dequantize.
tensor([[[-0.0454, 1.5703, 0.4180, ..., -0.7031, 0.2451, 2.5156],
[ 0.5703, 0.5234, 0.4609, ..., -0.7969, 0.1670, -1.1016],
[ 0.4414, -0.2070, -0.1963, ..., -0.6367, -2.0938, -0.9922],
...,
[-0.2500, 0.3066, 0.0148, ..., -0.1113, 0.7812, -0.3320],
[ 1.6719, 1.1016, 0.0967, ..., 1.0781, 0.2119, -0.0154],
[-0.3008, -0.4980, 0.7500, ..., 0.2148, -0.4492, -0.9922]]],
device='mps:0', dtype=torch.bfloat16)
---------------------------------------
tensor([[-0.0280, 0.0266, -0.0262, ..., 0.0250, -0.0146, -0.0339],
[-0.0029, -0.0022, -0.0571, ..., -0.0233, 0.0320, 0.0762],
[-0.0317, -0.0228, 0.0294, ..., 0.0176, -0.0413, 0.0415],
...,
[ 0.0291, -0.0141, -0.0147, ..., -0.0237, 0.0273, 0.0167],
[-0.0153, 0.0361, 0.0374, ..., 0.0039, -0.0464, 0.0461],
[-0.0737, 0.1211, -0.1138, ..., 0.0767, -0.0947, -0.0762]],
dtype=torch.bfloat16)
---------------------------------------
tensor([ 0.0081, 0.0062, 0.0003, ..., -0.0205, 0.0298, -0.0289],
dtype=torch.bfloat16)
---------------------------------------
and they are on the CPU before dequantisation too
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
tensor([[[-0.0454, 1.5703, 0.4180, ..., -0.7031, 0.2451, 2.5156],
[ 0.5703, 0.5234, 0.4609, ..., -0.7969, 0.1670, -1.1016],
[ 0.4414, -0.2070, -0.1963, ..., -0.6367, -2.0938, -0.9922],
...,
[-0.2500, 0.3066, 0.0148, ..., -0.1113, 0.7812, -0.3320],
[ 1.6719, 1.1016, 0.0967, ..., 1.0781, 0.2119, -0.0154],
[-0.3008, -0.4980, 0.7500, ..., 0.2148, -0.4492, -0.9922]]],
device='mps:0', dtype=torch.bfloat16)
---------------------------------------
GGMLTensor(type=F32, dequantized_shape=(torch.Size([3072, 64]))
tensor([[-0.0280, 0.0266, -0.0262, ..., 0.0250, -0.0146, -0.0339],
[-0.0029, -0.0022, -0.0571, ..., -0.0233, 0.0320, 0.0762],
[-0.0317, -0.0228, 0.0294, ..., 0.0176, -0.0413, 0.0415],
...,
[ 0.0291, -0.0141, -0.0147, ..., -0.0237, 0.0273, 0.0167],
[-0.0153, 0.0361, 0.0374, ..., 0.0039, -0.0464, 0.0461],
[-0.0737, 0.1211, -0.1138, ..., 0.0767, -0.0947, -0.0762]])
---------------------------------------
GGMLTensor(type=F32, dequantized_shape=(torch.Size([3072]))
tensor([ 0.0081, 0.0062, 0.0003, ..., -0.0205, 0.0298, -0.0289])
---------------------------------------
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
What you expected to happen
I expected the GGUF models to work and produce an image
How to reproduce the problem
Attempt to generate an image using a GGUF-quantised model, even a simple Linear UI render with no control models or LoRAs.
Additional context
No response
Discord username
Vargol