
[Bug] V100 cannot perform full fine-tuning of BF16 models #4082

@lingyezhixing

Description

from unsloth import is_bfloat16_supported, FastLanguageModel
from unsloth.chat_templates import get_chat_template, standardize_sharegpt
import torch
import sys

cache_dir = r"D:\LLM\Unsloth\SUB-Renamer\unsloth_compiled_cache"
sys.path.insert(0, cache_dir)

max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "E:/models/LLM/LFM2-350M",
    max_seq_length = max_seq_length,
    dtype = dtype,
    full_finetuning = True,
    trust_remote_code = True,
)

print(f"Model weights dtype: {model.dtype}")
print(f"Model config dtype: {model.config.torch_dtype}")
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
[E:\Programming\pycodes\miniconda3\envs\unsloth\Lib\site-packages\tqdm\auto.py:21](file:///E:/Programming/pycodes/miniconda3/envs/unsloth/Lib/site-packages/tqdm/auto.py#line=20): TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
W0220 09:11:01.289000 20796 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: You selected full finetuning support, but 4bit / 8bit is enabled - disabling LoRA / QLoRA.
Unsloth: WARNING `trust_remote_code` is True.
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2026.2.1: Fast Lfm2 patching. Transformers: 4.57.3.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 32.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.
`torch_dtype` is deprecated! Use `dtype` instead!
Model weights dtype: torch.float32
Model config dtype: torch.float32

Since the V100 does not support BF16, Unsloth automatically upcasts the weights to FP32. The trainer then trips the check `if not force_float32 and (not float16 and use_fp16)` in this section of unsloth_compiled_cache/UnslothSFTTrainer.py:

# unsloth_compiled_cache/UnslothSFTTrainer.py
...
if args is None: args = UnslothSFTConfig()
use_bf16 = getattr(args, 'bf16', False)
if type(use_bf16) is not bool: use_bf16 = False
use_fp16 = getattr(args, 'fp16', False)
if type(use_fp16) is not bool: use_fp16 = False
force_float32 = False
full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1'
if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'):
    print('Unsloth: Switching to float32 training since model cannot work with float16')
    force_float32 = True
mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32')
dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None)
if dtype is None: dtype = model.get_input_embeddings().weight.dtype
from unsloth_zoo.utils import _get_dtype
dtype = _get_dtype(dtype)
float16 = dtype == torch.float16
if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')
if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`')
...
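
To make the failure mode concrete, here is a minimal sketch of how those values evaluate in my setup (the assumed values are in the comments; this is not Unsloth source code):

import torch

# Assumed values on a V100 with full_finetuning = True:
dtype = torch.float32        # weights were upcast to float32 because the V100 lacks BF16
use_fp16 = True              # SFTConfig fp16 = not is_bfloat16_supported() -> True on V100
use_bf16 = False             # SFTConfig bf16 = is_bfloat16_supported() -> False on V100
force_float32 = False        # UNSLOTH_ENABLE_FULL_FINETUNING == '1' disables the float32 fallback

float16 = dtype == torch.float16   # False, since the weights are float32, not float16
if not force_float32 and (not float16 and use_fp16):
    # This is exactly the branch that raises in UnslothSFTTrainer.__init__
    raise TypeError("Unsloth: Model is in bfloat16 precision but you want to use float16 precision. ...")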
from trl import SFTConfig, SFTTrainer

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use num_train_epochs = 1, warmup_ratio for full training runs!
        # warmup_steps = 2,
        # max_steps = 45,
        num_train_epochs = 3,
        warmup_ratio = 0.1,

        learning_rate = 1e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc

        dataset_num_proc = 1,

        save_strategy="steps",      # 按步数保存
        save_steps=2000,            # 每 2000 步保存一次
    ),
)
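
For reference, on a V100 is_bfloat16_supported() returns False, so the config above resolves to fp16 = True and bf16 = False:

from unsloth import is_bfloat16_supported

# Tesla V100 is compute capability 7.0 and has no BF16 support,
# so the precision flags above become fp16=True, bf16=False.
print(is_bfloat16_supported())   # False

Constructing the trainer with this config then raises: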
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[3], line 3
      1 from trl import SFTConfig, SFTTrainer
----> 3 trainer = SFTTrainer(
      4     model = model,
      5     tokenizer = tokenizer,
      6     train_dataset = dataset,
      7     dataset_text_field = "text",
      8     max_seq_length = max_seq_length,
      9     args = SFTConfig(
     10         per_device_train_batch_size = 2,
     11         gradient_accumulation_steps = 8,
     12 
     13         # Use num_train_epochs = 1, warmup_ratio for full training runs!
     14         # warmup_steps = 2,
     15         # max_steps = 45,
     16         num_train_epochs = 3,
     17         warmup_ratio = 0.1,
     18 
     19         learning_rate = 1e-5,
     20         fp16 = not is_bfloat16_supported(),
     21         bf16 = is_bfloat16_supported(),
     22         logging_steps = 1,
     23         optim = "adamw_8bit",
     24         weight_decay = 0.01,
     25         lr_scheduler_type = "cosine",
     26         seed = 3407,
     27         output_dir = "outputs",
     28         report_to = "none", # Use this for WandB etc
     29 
     30         dataset_num_proc = 1,
     31 
     32         save_strategy="steps",      # save by step count
     33         save_steps=2000,            # save a checkpoint every 2000 steps
     34     ),
     35 )

File [E:\Programming\pycodes\miniconda3\envs\unsloth\Lib\site-packages\unsloth\trainer.py:408](file:///E:/Programming/pycodes/miniconda3/envs/unsloth/Lib/site-packages/unsloth/trainer.py#line=407), in _patch_sft_trainer_auto_packing.<locals>.new_init(self, *args, **kwargs)
    403         logger.info(
    404             "Unsloth: Padding-free batching auto-enabled for SFTTrainer instance."
    405         )
    407 try:
--> 408     original_init(self, *args, **kwargs)
    409 except ValueError as exc:
    410     if packing_active and _should_skip_auto_packing_error(exc):

File [E:\Programming\pycodes\miniconda3\envs\unsloth\Lib\site-packages\unsloth\trainer.py:314](file:///E:/Programming/pycodes/miniconda3/envs/unsloth/Lib/site-packages/unsloth/trainer.py#line=313), in _backwards_compatible_trainer.<locals>.new_init(self, *args, **kwargs)
    312     kwargs = trainer_kwargs
    313     kwargs["args"] = config
--> 314 original_init(self, *args, **kwargs)

File [D:\LLM\Unsloth\SUB-Renamer\unsloth_compiled_cache\UnslothSFTTrainer.py:1414](file:///D:/LLM/Unsloth/SUB-Renamer/unsloth_compiled_cache/UnslothSFTTrainer.py#line=1413), in UnslothSFTTrainer.__init__(self, model, args, data_collator, train_dataset, eval_dataset, processing_class, compute_loss_func, compute_metrics, callbacks, optimizer_cls_and_kwargs, preprocess_logits_for_metrics, peft_config, formatting_func, **kwargs)
   1412 float16 = dtype == torch.float16
   1413 if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')
-> 1414 if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`')
   1415 if force_float32:
   1416     # Forced float32 training
   1417     args.fp16 = False

TypeError: Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`

In versions prior to December 2025 (I don't remember exactly which release broke it), I could bypass this check by setting `model.config.torch_dtype = torch.float16`, but that workaround no longer works. I hope this can be fixed soon.
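
My guess, based on the dtype lookup in the excerpt above, is that the old workaround stopped working because the trainer now reads model.config.dtype first and only falls back to torch_dtype, so overriding torch_dtype alone is ignored. A sketch of what I would expect to bypass the check today, assuming the same lookup order (untested):

import torch

# Old workaround (no longer effective): only torch_dtype was overridden.
# model.config.torch_dtype = torch.float16

# Hypothetical equivalent for the current lookup order
# getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None):
model.config.dtype = torch.float16        # checked first by UnslothSFTTrainer
model.config.torch_dtype = torch.float16  # kept for older code paths

Even if this silences the TypeError, the weights themselves are still float32 after the upcast, so a proper fix in Unsloth would be much better.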
