
[Bug] V100 cannot perform full fine-tuning of BF16 models #4082

@lingyezhixing

Description

from unsloth import is_bfloat16_supported, FastLanguageModel
from unsloth.chat_templates import get_chat_template, standardize_sharegpt
import torch
import sys

cache_dir = r"D:\LLM\Unsloth\SUB-Renamer\unsloth_compiled_cache"
sys.path.insert(0, cache_dir)

max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "E:/models/LLM/LFM2-350M",
    max_seq_length = max_seq_length,
    dtype = dtype,
    full_finetuning = True,
    trust_remote_code = True,
)

print(f"Model weights dtype: {model.dtype}")
print(f"Model config dtype: {model.config.torch_dtype}")
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
[E:\Programming\pycodes\miniconda3\envs\unsloth\Lib\site-packages\tqdm\auto.py:21](file:///E:/Programming/pycodes/miniconda3/envs/unsloth/Lib/site-packages/tqdm/auto.py#line=20): TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
W0220 09:11:01.289000 20796 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: You selected full finetuning support, but 4bit / 8bit is enabled - disabling LoRA / QLoRA.
Unsloth: WARNING `trust_remote_code` is True.
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2026.2.1: Fast Lfm2 patching. Transformers: 4.57.3.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 32.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.
`torch_dtype` is deprecated! Use `dtype` instead!
Model weights dtype: torch.float32
Model config dtype: torch.float32

Since the V100 does not support BF16, Unsloth automatically upcasts the weights to FP32. The trainer then trips the check `if not force_float32 and (not float16 and use_fp16)` in this section of unsloth_compiled_cache/UnslothSFTTrainer.py:

# unsloth_compiled_cache/UnslothSFTTrainer.py
...
if args is None: args = UnslothSFTConfig()
use_bf16 = getattr(args, 'bf16', False)
if type(use_bf16) is not bool: use_bf16 = False
use_fp16 = getattr(args, 'fp16', False)
if type(use_fp16) is not bool: use_fp16 = False
force_float32 = False
full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1'
if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'):
    print('Unsloth: Switching to float32 training since model cannot work with float16')
    force_float32 = True
mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32')
dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None)
if dtype is None: dtype = model.get_input_embeddings().weight.dtype
from unsloth_zoo.utils import _get_dtype
dtype = _get_dtype(dtype)
float16 = dtype == torch.float16
if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')
if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`')
...
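
To make the failure mode concrete, here is a minimal sketch of how those values evaluate in my setup (the assumed values are in the comments; this is not Unsloth source code):

import torch

# Assumed values on a V100 with full_finetuning = True:
dtype = torch.float32        # weights were upcast to float32 because the V100 lacks BF16
use_fp16 = True              # SFTConfig fp16 = not is_bfloat16_supported() -> True on V100
use_bf16 = False             # SFTConfig bf16 = is_bfloat16_supported() -> False on V100
force_float32 = False        # UNSLOTH_ENABLE_FULL_FINETUNING == '1' disables the float32 fallback

float16 = dtype == torch.float16   # False, since the weights are float32, not float16
if not force_float32 and (not float16 and use_fp16):
    # This is exactly the branch that raises in UnslothSFTTrainer.__init__
    raise TypeError("Unsloth: Model is in bfloat16 precision but you want to use float16 precision. ...")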
from trl import SFTConfig, SFTTrainer

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use num_train_epochs = 1, warmup_ratio for full training runs!
        # warmup_steps = 2,
        # max_steps = 45,
        num_train_epochs = 3,
        warmup_ratio = 0.1,

        learning_rate = 1e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc

        dataset_num_proc = 1,

        save_strategy="steps",      # 按步数保存
        save_steps=2000,            # 每 2000 步保存一次
    ),
)
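
For reference, on a V100 is_bfloat16_supported() returns False, so the config above resolves to fp16 = True and bf16 = False:

from unsloth import is_bfloat16_supported

# Tesla V100 is compute capability 7.0 and has no BF16 support,
# so the precision flags above become fp16=True, bf16=False.
print(is_bfloat16_supported())   # False

Constructing the trainer with this config then raises: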
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[3], line 3
      1 from trl import SFTConfig, SFTTrainer
----> 3 trainer = SFTTrainer(
      4     model = model,
      5     tokenizer = tokenizer,
      6     train_dataset = dataset,
      7     dataset_text_field = "text",
      8     max_seq_length = max_seq_length,
      9     args = SFTConfig(
     10         per_device_train_batch_size = 2,
     11         gradient_accumulation_steps = 8,
     12 
     13         # Use num_train_epochs = 1, warmup_ratio for full training runs!
     14         # warmup_steps = 2,
     15         # max_steps = 45,
     16         num_train_epochs = 3,
     17         warmup_ratio = 0.1,
     18 
     19         learning_rate = 1e-5,
     20         fp16 = not is_bfloat16_supported(),
     21         bf16 = is_bfloat16_supported(),
     22         logging_steps = 1,
     23         optim = "adamw_8bit",
     24         weight_decay = 0.01,
     25         lr_scheduler_type = "cosine",
     26         seed = 3407,
     27         output_dir = "outputs",
     28         report_to = "none", # Use this for WandB etc
     29 
     30         dataset_num_proc = 1,
     31 
     32         save_strategy="steps",      # save by step count
     33         save_steps=2000,            # save a checkpoint every 2000 steps
     34     ),
     35 )

File [E:\Programming\pycodes\miniconda3\envs\unsloth\Lib\site-packages\unsloth\trainer.py:408](file:///E:/Programming/pycodes/miniconda3/envs/unsloth/Lib/site-packages/unsloth/trainer.py#line=407), in _patch_sft_trainer_auto_packing.<locals>.new_init(self, *args, **kwargs)
    403         logger.info(
    404             "Unsloth: Padding-free batching auto-enabled for SFTTrainer instance."
    405         )
    407 try:
--> 408     original_init(self, *args, **kwargs)
    409 except ValueError as exc:
    410     if packing_active and _should_skip_auto_packing_error(exc):

File [E:\Programming\pycodes\miniconda3\envs\unsloth\Lib\site-packages\unsloth\trainer.py:314](file:///E:/Programming/pycodes/miniconda3/envs/unsloth/Lib/site-packages/unsloth/trainer.py#line=313), in _backwards_compatible_trainer.<locals>.new_init(self, *args, **kwargs)
    312     kwargs = trainer_kwargs
    313     kwargs["args"] = config
--> 314 original_init(self, *args, **kwargs)

File [D:\LLM\Unsloth\SUB-Renamer\unsloth_compiled_cache\UnslothSFTTrainer.py:1414](file:///D:/LLM/Unsloth/SUB-Renamer/unsloth_compiled_cache/UnslothSFTTrainer.py#line=1413), in UnslothSFTTrainer.__init__(self, model, args, data_collator, train_dataset, eval_dataset, processing_class, compute_loss_func, compute_metrics, callbacks, optimizer_cls_and_kwargs, preprocess_logits_for_metrics, peft_config, formatting_func, **kwargs)
   1412 float16 = dtype == torch.float16
   1413 if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')
-> 1414 if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`')
   1415 if force_float32:
   1416     # Forced float32 training
   1417     args.fp16 = False

TypeError: Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`

In versions prior to December 2025 (I don't remember exactly which release broke it), I could bypass this check by setting `model.config.torch_dtype = torch.float16`, but that workaround no longer works. I hope this can be fixed soon.
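
My guess, based on the dtype lookup in the excerpt above, is that the old workaround stopped working because the trainer now reads model.config.dtype first and only falls back to torch_dtype, so overriding torch_dtype alone is ignored. A sketch of what I would expect to bypass the check today, assuming the same lookup order (untested):

import torch

# Old workaround (no longer effective): only torch_dtype was overridden.
# model.config.torch_dtype = torch.float16

# Hypothetical equivalent for the current lookup order
# getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None):
model.config.dtype = torch.float16        # checked first by UnslothSFTTrainer
model.config.torch_dtype = torch.float16  # kept for older code paths

Even if this silences the TypeError, the weights themselves are still float32 after the upcast, so a proper fix in Unsloth would be much better.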
