Description
I get a lot of errors when trying to fine-tune any model with FastModel or FastVisionModel (instead of FastLanguageModel) on multiple GPUs with device_map="auto" or device_map="balanced".
Please add multi-GPU support for FastVisionModel and FastModel, just like FastLanguageModel already has. A minimal sketch of my setup is below.
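For reference, this is roughly the setup that triggers it. It is only a sketch, not my exact script: the model repo name, dataset, and trainer arguments are placeholders, and I'm assuming device_map is forwarded for vision models the same way it is for FastLanguageModel.

```python
from unsloth import FastVisionModel
from trl import SFTConfig, SFTTrainer

# Placeholder repo name: the actual run used a Qwen3-VL 2B variant
# (see unsloth_compiled_module_qwen3_vl.py in the traceback below).
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen3-VL-2B-Instruct",
    load_in_4bit = True,
    device_map   = "auto",          # "balanced" fails the same way
)

# LoRA adapters; lowering r does not avoid the crash on multi-GPU.
model = FastVisionModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 16,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,          # processing_class= on newer trl
    train_dataset = train_dataset,  # placeholder: vision chat-format dataset
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        num_train_epochs = 1,
        output_dir = "outputs",
    ),
)

trainer_stats = trainer.train()     # raises the traceback below on 2 GPUs
```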
Error:
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 151645, 'bos_token_id': None}.
==((====))== Unsloth - 2x faster free finetuning | Num GPUs used = 2
\\ /| Num examples = 262,751 | Num Epochs = 1 | Total steps = 32,844
O^O/ \_/ \ Batch size per device = 2 | Gradient accumulation steps = 4
\ / Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
"-____-" Trainable parameters = 34,865,152 of 2,162,397,184 (1.61% trained)
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
NotImplementedError: Cannot access storage of TensorWrapper
The above exception was the direct cause of the following exception:
Unsupported Traceback (most recent call last)
/tmp/ipykernel_1322/773422404.py in <cell line: 0>()
----> 1 trainer_stats = trainer.train()
/kaggle/working/unsloth_compiled_cache/UnslothSFTTrainer.py in wrapper(self, *args, **kwargs)
51 if hasattr(self, 'model') and hasattr(self.model, "for_training"):
52 self.model.for_training()
---> 53 output = f(self, *args, **kwargs)
54 # Return inference mode
55 if hasattr(self, 'model') and hasattr(self.model, "for_inference"):
/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
2323 hf_hub_utils.enable_progress_bars()
2324 else:
-> 2325 return inner_training_loop(
2326 args=args,
2327 resume_from_checkpoint=resume_from_checkpoint,
/usr/local/lib/python3.11/dist-packages/unsloth_zoo/compiler.py in _fast_inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
/kaggle/working/unsloth_compiled_cache/UnslothSFTTrainer.py in training_step(self, *args, **kwargs)
1023 def training_step(self, *args, **kwargs):
1024 with self.maybe_activation_offload_context:
-> 1025 return super().training_step(*args, **kwargs)
1026
1027 def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
/usr/local/lib/python3.11/dist-packages/unsloth/models/_utils.py in _unsloth_training_step(self, model, inputs, num_items_in_batch)
/kaggle/working/unsloth_compiled_cache/UnslothSFTTrainer.py in compute_loss(self, model, inputs, return_outputs, num_items_in_batch)
1012
1013 def compute_loss(self, model, inputs, return_outputs = False, num_items_in_batch = None):
-> 1014 outputs = super().compute_loss(
1015 model,
1016 inputs,
/usr/local/lib/python3.11/dist-packages/unsloth/models/_utils.py in _unsloth_pre_compute_loss(self, model, inputs, *args, **kwargs)
1387 )
1388 pass
-> 1389 outputs = self._old_compute_loss(model, inputs, *args, **kwargs)
1390 return outputs
1391 pass
/usr/local/lib/python3.11/dist-packages/unsloth/models/_utils.py in compute_loss(self, model, inputs, return_outputs, num_items_in_batch)
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1773 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1774 else:
-> 1775 return self._call_impl(*args, **kwargs)
1776
1777 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1784 or _global_backward_pre_hooks or _global_backward_hooks
1785 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1786 return forward_call(*args, **kwargs)
1787
1788 result = None
/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py in forward(*args, **kwargs)
816
817 def forward(*args, **kwargs):
--> 818 return model_forward(*args, **kwargs)
819
820 # To act like a decorator so that it can be popped when doing `extract_model_from_parallel`
/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py in __call__(self, *args, **kwargs)
804
805 def __call__(self, *args, **kwargs):
--> 806 return convert_to_fp32(self.model_forward(*args, **kwargs))
807
808 def __getstate__(self):
/usr/local/lib/python3.11/dist-packages/torch/amp/autocast_mode.py in decorate_autocast(*args, **kwargs)
42 def decorate_autocast(*args, **kwargs):
43 with autocast_instance:
---> 44 return func(*args, **kwargs)
45
46 decorate_autocast.__script_unsupported = ( # type: ignore[attr-defined]
/usr/local/lib/python3.11/dist-packages/peft/peft_model.py in forward(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)
1848 with self._enable_peft_forward_hooks(**kwargs):
1849 kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
-> 1850 return self.base_model(
1851 input_ids=input_ids,
1852 attention_mask=attention_mask,
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1773 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1774 else:
-> 1775 return self._call_impl(*args, **kwargs)
1776
1777 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1879
1880 try:
-> 1881 return inner()
1882 except Exception:
1883 # run always called hooks if they have not already been run
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in inner()
1827 args = bw_hook.setup_input_hook(args)
1828
-> 1829 result = forward_call(*args, **kwargs)
1830 if _global_forward_hooks or self._forward_hooks:
1831 for hook_id, hook in (
/usr/local/lib/python3.11/dist-packages/peft/tuners/tuners_utils.py in forward(self, *args, **kwargs)
220
221 def forward(self, *args: Any, **kwargs: Any):
--> 222 return self.model.forward(*args, **kwargs)
223
224 def _pre_injection_hook(self, model: nn.Module, config: PeftConfig, adapter_name: str) -> None:
/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
173 output = module._old_forward(*args, **kwargs)
174 else:
--> 175 output = module._old_forward(*args, **kwargs)
176 return module._hf_hook.post_forward(module, output)
177
/kaggle/working/unsloth_compiled_cache/unsloth_compiled_module_qwen3_vl.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, cache_position, logits_to_keep, **kwargs)
1209 **kwargs: Unpack[TransformersKwargs],
1210 ) -> Union[tuple, Qwen3VLCausalLMOutputWithPast]:
-> 1211 return Qwen3VLForConditionalGeneration_forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, cache_position, logits_to_keep, **kwargs)
1212
1213 def prepare_inputs_for_generation(
/usr/local/lib/python3.11/dist-packages/torch/_dynamo/external_utils.py in nonrecursive_disable_wrapper(*args, **kwargs)
194 @functools.wraps(fn)
195 def nonrecursive_disable_wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
--> 196 return fn(*args, **kwargs)
197
198 return nonrecursive_disable_wrapper
/kaggle/working/unsloth_compiled_cache/unsloth_compiled_module_qwen3_vl.py in Qwen3VLForConditionalGeneration_forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, cache_position, logits_to_keep, **kwargs)
1114 torch._dynamo.mark_dynamic(_hidden_states, 1)
1115 torch._dynamo.mark_dynamic(labels, 1)
-> 1116 loss = unsloth_fused_ce_loss(
1117 trainer = None,
1118 hidden_states = _hidden_states,
/usr/local/lib/python3.11/dist-packages/unsloth_zoo/fused_losses/cross_entropy_loss.py in unsloth_fused_ce_loss(trainer, hidden_states, lm_head_weight, lm_head_bias, labels, mask, n_items, scaling, target_gb, torch_compile, overwrite, **kwargs)
362 scaling = scaler.get_scale() if scaler is not None else scaling
363 if hasattr(scaling, "get_scale"): scaling = scaling.get_scale()
--> 364 return apply_autograd_function(UnslothFusedLoss, dict(
365 loss_function = compute_fused_ce_loss,
366 hidden_states = hidden_states,
/usr/local/lib/python3.11/dist-packages/unsloth_zoo/fused_losses/cross_entropy_loss.py in apply_autograd_function(autograd, mapping)
39 def apply_autograd_function(autograd, mapping):
40 parameters, defaults = _get_mapping(autograd)
---> 41 return getattr(autograd, "apply")(*(
42 mapping.get(old_key, default) \
43 for old_key, default in zip(parameters, defaults)
/usr/local/lib/python3.11/dist-packages/torch/autograd/function.py in apply(cls, *args, **kwargs)
579 # See NOTE: [functorch vjp and autograd interaction]
580 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 581 return super().apply(*args, **kwargs) # type: ignore[misc]
582
583 if not is_setup_ctx_defined:
/usr/local/lib/python3.11/dist-packages/unsloth_zoo/fused_losses/cross_entropy_loss.py in forward(ctx, loss_function, hidden_states, lm_head_weight, lm_head_bias, labels, mask, n_items, scaling, shift_labels, target_gb, torch_compile, overwrite, extra_kwargs)
302 for (grad_inputs_j, hidden_states_j, labels_j,) in \
303 zip(__grad_inputs, __shift_states, __shift_labels,):
--> 304 accumulate_chunk(
305 n_chunks = n_chunks,
306 grad_inputs_j = grad_inputs_j,
/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py in compile_wrapper(*args, **kwargs)
839 cur_exn.__cause__.with_traceback(None)
840 cur_exn = cur_exn.__cause__
--> 841 raise e.with_traceback(None) from e.__cause__ # User compiler error
842 except ShortenTraceback as e:
843 # Failures in the backend likely don't have useful
Unsupported: NotImplementedError/UnsupportedFakeTensorException when running FX node
Explanation: Dynamo failed to run FX node with fake tensors: call_function <function _autograd_grad at 0x7adc2d2d8180>(*((GradTrackingTensor(lvl=1, value=
FakeTensor(..., device='cuda:0', size=())
),), [GradTrackingTensor(lvl=1, value=
FakeTensor(..., device='cuda:1', size=(s97, 2048), dtype=torch.float16,
requires_grad=True)
)]), **{'create_graph': True}): got NotImplementedError('Cannot access storage of TensorWrapper')
Hint: If the op is a PyTorch op, please file an issue to PyTorch.
Developer debug context:
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0087.html
from user code:
File "/usr/local/lib/python3.11/dist-packages/unsloth_zoo/fused_losses/cross_entropy_loss.py", line 276, in accumulate_chunk
(chunk_loss, (unscaled_loss,)) = torch.func.grad_and_value(
File "/usr/local/lib/python3.11/dist-packages/torch/_functorch/apis.py", line 449, in wrapper
return eager_transforms.grad_and_value_impl(
File "/usr/local/lib/python3.11/dist-packages/torch/_functorch/vmap.py", line 47, in fn
return f(*args, **kwargs)
File "/usr/local/lib/python3.11/dist-packages/torch/_functorch/eager_transforms.py", line 1390, in grad_and_value_impl
flat_grad_input = _autograd_grad(
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"

I've tried setting the LoRA rank very low and it still fails on multi-GPU, so this is not just an out-of-memory error. To make it run on a single GPU the LoRA rank has to be so low that fine-tuning is not really worth it; see the sketch below.
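For context, "very low" means something like the following (the numbers are illustrative): even at this rank the multi-GPU run still hits the TensorWrapper error above, while on a single GPU anything much higher runs out of memory.

```python
# Sketch only: a tiny LoRA rank still crashes on multi-GPU, so the failure
# is not an OOM issue; on one GPU this rank fits but is too small to be useful.
model = FastVisionModel.get_peft_model(
    model,
    r = 4,
    lora_alpha = 4,
)
```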