Skip to content

[bug]: GGUF models no longer work on MacOS, tensors on cpu not on mps #7939

Open
@Vargol

Description

@Vargol

Is there an existing issue for this problem?

  • I have searched the existing issues

Operating system

macOS

GPU vendor

Apple Silicon (MPS)

GPU model

M3

GPU VRAM

24

Version number

5.10.0

Browser

Safari 18.3.1

Python dependencies

{
"version": "5.10.0",
"dependencies": {
"accelerate" : "1.6.0" ,
"compel" : "2.0.2" ,
"cuda" : null ,
"diffusers" : "0.33.0" ,
"numpy" : "1.26.4" ,
"opencv" : "4.9.0.80",
"onnx" : "1.16.1" ,
"pillow" : "11.2.1" ,
"python" : "3.11.10" ,
"torch" : "2.6.0" ,
"torchvision" : "0.21.0" ,
"transformers": "4.51.3" ,
"xformers" : null
},
"config": {
"schema_version": "4.0.2",
"legacy_models_yaml_path": null,
"host": "127.0.0.1",
"port": 9090,
"allow_origins": [],
"allow_credentials": true,
"allow_methods": [""],
"allow_headers": ["
"],
"ssl_certfile": null,
"ssl_keyfile": null,
"log_tokenization": false,
"patchmatch": true,
"models_dir": "models",
"convert_cache_dir": "models/.convert_cache",
"download_cache_dir": "models/.download_cache",
"legacy_conf_dir": "configs",
"db_dir": "databases",
"outputs_dir": "/Users/davidburnett/invokeai/outputs",
"custom_nodes_dir": "nodes",
"style_presets_dir": "style_presets",
"workflow_thumbnails_dir": "workflow_thumbnails",
"log_handlers": ["console"],
"log_format": "color",
"log_level": "info",
"log_sql": false,
"log_level_network": "warning",
"use_memory_db": false,
"dev_reload": false,
"profile_graphs": false,
"profile_prefix": null,
"profiles_dir": "profiles",
"max_cache_ram_gb": null,
"max_cache_vram_gb": null,
"log_memory_usage": false,
"device_working_mem_gb": 3,
"enable_partial_loading": false,
"keep_ram_copy_of_weights": false,
"ram": null,
"vram": null,
"lazy_offload": true,
"pytorch_cuda_alloc_conf": null,
"device": "mps",
"precision": "bfloat16",
"sequential_guidance": false,
"attention_type": "torch-sdp",
"attention_slice_size": 1,
"force_tiled_decode": false,
"pil_compress_level": 1,
"max_queue_size": 10000,
"clear_queue_on_startup": false,
"allow_nodes": null,
"deny_nodes": null,
"node_cache_size": 512,
"hashing_algorithm": "blake3_single",
"remote_api_tokens": null,
"scan_models_on_startup": false
},
"set_config_fields": [
"precision" , "outputs_dir" , "keep_ram_copy_of_weights", "attention_type" ,
"attention_slice_size" , "legacy_models_yaml_path" , "device"
]
}

What happened

Running a simple Linear UI Flux render using a GGUF based model now fails with

  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/_ops.py", line 723, in __call__
    return self._op(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Tensor for argument weight is on cpu but expected on mps

I've tried multiple GGUF-based models and they've all failed with the same error; an original non-quantised Flux model works fine.

The Full backtrace is...

[2025-04-18 11:56:59,089]::[InvokeAI]::ERROR --> Error while invoking session ead91b2d-b83d-4fef-a13d-bf9bf9923340, invocation 4c89d2a6-b5e1-466d-95bc-cf2c3aa14333 (flux_denoise): Tensor for argument weight is on cpu but expected on mps
[2025-04-18 11:56:59,089]::[InvokeAI]::ERROR --> Traceback (most recent call last):
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/app/services/session_processor/session_processor_default.py", line 129, in run_node
    output = invocation.invoke_internal(context=context, services=self._services)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/app/invocations/baseinvocation.py", line 212, in invoke_internal
    output = self.invoke(context)
             ^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/app/invocations/flux_denoise.py", line 155, in invoke
    latents = self._run_diffusion(context)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/app/invocations/flux_denoise.py", line 379, in _run_diffusion
    x = denoise(
        ^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/backend/flux/denoise.py", line 75, in denoise
    pred = model(
           ^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/backend/flux/model.py", line 110, in forward
    img = self.img_in(img)
          ^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/custom_linear.py", line 84, in forward
    return super().forward(input)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/nn/modules/linear.py", line 125, in forward
    return F.linear(input, self.weight, self.bias)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/backend/quantization/gguf/ggml_tensor.py", line 187, in __torch_dispatch__
    return GGML_TENSOR_OP_TABLE[func](func, args, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/invokeai/backend/quantization/gguf/ggml_tensor.py", line 37, in dequantize_and_run_debug
    return func(*dequantized_args, **dequantized_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/SSD2TB/AI/InvokeAI/lib/python3.11/site-packages/torch/_ops.py", line 723, in __call__
    return self._op(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Tensor for argument weight is on cpu but expected on mps

I've created a debug version of dequantize_and_run which suggests the weight and bias are on the CPU device after dequantisation.

tensor([[[-0.0454,  1.5703,  0.4180,  ..., -0.7031,  0.2451,  2.5156],
         [ 0.5703,  0.5234,  0.4609,  ..., -0.7969,  0.1670, -1.1016],
         [ 0.4414, -0.2070, -0.1963,  ..., -0.6367, -2.0938, -0.9922],
         ...,
         [-0.2500,  0.3066,  0.0148,  ..., -0.1113,  0.7812, -0.3320],
         [ 1.6719,  1.1016,  0.0967,  ...,  1.0781,  0.2119, -0.0154],
         [-0.3008, -0.4980,  0.7500,  ...,  0.2148, -0.4492, -0.9922]]],
       device='mps:0', dtype=torch.bfloat16)
---------------------------------------
tensor([[-0.0280,  0.0266, -0.0262,  ...,  0.0250, -0.0146, -0.0339],
        [-0.0029, -0.0022, -0.0571,  ..., -0.0233,  0.0320,  0.0762],
        [-0.0317, -0.0228,  0.0294,  ...,  0.0176, -0.0413,  0.0415],
        ...,
        [ 0.0291, -0.0141, -0.0147,  ..., -0.0237,  0.0273,  0.0167],
        [-0.0153,  0.0361,  0.0374,  ...,  0.0039, -0.0464,  0.0461],
        [-0.0737,  0.1211, -0.1138,  ...,  0.0767, -0.0947, -0.0762]],
       dtype=torch.bfloat16)
---------------------------------------
tensor([ 0.0081,  0.0062,  0.0003,  ..., -0.0205,  0.0298, -0.0289],
       dtype=torch.bfloat16)
---------------------------------------

and they are on the CPU before dequantisation too

AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
tensor([[[-0.0454,  1.5703,  0.4180,  ..., -0.7031,  0.2451,  2.5156],
         [ 0.5703,  0.5234,  0.4609,  ..., -0.7969,  0.1670, -1.1016],
         [ 0.4414, -0.2070, -0.1963,  ..., -0.6367, -2.0938, -0.9922],
         ...,
         [-0.2500,  0.3066,  0.0148,  ..., -0.1113,  0.7812, -0.3320],
         [ 1.6719,  1.1016,  0.0967,  ...,  1.0781,  0.2119, -0.0154],
         [-0.3008, -0.4980,  0.7500,  ...,  0.2148, -0.4492, -0.9922]]],
       device='mps:0', dtype=torch.bfloat16)
---------------------------------------
GGMLTensor(type=F32, dequantized_shape=(torch.Size([3072, 64]))
tensor([[-0.0280,  0.0266, -0.0262,  ...,  0.0250, -0.0146, -0.0339],
        [-0.0029, -0.0022, -0.0571,  ..., -0.0233,  0.0320,  0.0762],
        [-0.0317, -0.0228,  0.0294,  ...,  0.0176, -0.0413,  0.0415],
        ...,
        [ 0.0291, -0.0141, -0.0147,  ..., -0.0237,  0.0273,  0.0167],
        [-0.0153,  0.0361,  0.0374,  ...,  0.0039, -0.0464,  0.0461],
        [-0.0737,  0.1211, -0.1138,  ...,  0.0767, -0.0947, -0.0762]])
---------------------------------------
GGMLTensor(type=F32, dequantized_shape=(torch.Size([3072]))
tensor([ 0.0081,  0.0062,  0.0003,  ..., -0.0205,  0.0298, -0.0289])
---------------------------------------
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

What you expected to happen

I expected the GGUF models to work and produce an image

How to reproduce the problem

Attempt to generate an image using a GGUF quantised model, even a simple Linear UI render with no control models or LoRA's.

Additional context

No response

Discord username

Vargol

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug — Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions