instructlab
diff --git a/‎src/instructlab/training/accelerator.py‎
Lines changed: 28 additions & 4 deletions b/‎src/instructlab/training/accelerator.py‎
Lines changed: 28 additions & 4 deletions
diff --git a/‎src/instructlab/training/model.py‎
Lines changed: 136 additions & 36 deletions b/‎src/instructlab/training/model.py‎
Lines changed: 136 additions & 36 deletions
@@ -135,17 +135,41 @@ def __getattr__(self, name):
 
     def get_fsdp_config(self):
         is_lora = self.model.lora_config is not None
-        block_name = next(iter(self.model._no_split_modules))
 
         wrap_policy = None
         if is_lora > 0:
             wrap_policy = fsdp_auto_wrap_policy(self.model)
         else:
+            # Resolve all _no_split_modules names to actual classes present
+            # in the model. Some models (e.g. Qwen3.5) declare module names
+            # for architectures not loaded (e.g. vision blocks in a CausalLM),
+            # so we must filter out None results.
+            layer_classes = set()
+            # Try resolving against the wrapper model first, then the
+            # underlying HF model if the first pass yields nothing.
+            targets = [self.model]
+            hf_model = getattr(self.model, "model", None)
+            if hf_model is not None:
+                targets.append(hf_model)
+
+            for target in targets:
+                for block_name in self.model._no_split_modules:
+                    cls = get_module_class_from_name(target, block_name)
+                    if cls is not None:
+                        layer_classes.add(cls)
+                if layer_classes:
+                    break
+
+            if not layer_classes:
+                logger.warning(
+                    "Could not resolve any _no_split_modules "
+                    "(%s) to actual module classes in the model. "
+                    "FSDP will use the default wrap policy.",
+                    self.model._no_split_modules,
+                )
             wrap_policy = partial(
                 transformer_auto_wrap_policy,
-                transformer_layer_cls={
-                    get_module_class_from_name(self.model, block_name),
-                },
+                transformer_layer_cls=layer_classes,
             )
 
         # TODO(osilkin): BACKWARD_POST trades memory utilization for processing time, which is important for systems utilizing LoRA
 
@@ -45,6 +45,14 @@
 )
 from instructlab.training.gpt_oss_utils_correct import is_gpt_oss, is_known_model
 from instructlab.training.type_definitions import ModelInputs, ModelLosses
+from instructlab.training.vlm_utils import (
+    extract_causal_lm_from_vlm,
+    has_timm_vision_tower,
+    is_vlm_for_direct_loading,
+    is_vlm_with_causal_lm,
+    load_vlm_for_text_training,
+    needs_sdpa,
+)
 
 
 class Model:
@@ -67,6 +75,13 @@ def __init__(
         # check model type & set on the mclasss
         self.is_granitemoehybrid = is_known_model(model_path, "granitemoehybrid")
         self.is_gpt_oss = is_gpt_oss(model_path)
+
+        # Pre-populate the Hub kernel cache with locally installed mamba_ssm
+        # and causal_conv1d to avoid PyTorch/CUDA ABI mismatches with the
+        # Hub-provided kernel builds.
+        if self.is_granitemoehybrid:
+            self._use_local_mamba_kernels()
+
         if self.is_gpt_oss:
             # Third Party
             quant_config = Mxfp4Config(dequantize=True)
@@ -95,12 +110,92 @@ def __init__(
 
         # set flash attention accordingly
         if flash_enabled:
-            self.base_model_args["attn_implementation"] = "flash_attention_2"
-            if self.is_gpt_oss:
-                self.base_model_args["attn_implementation"] = (
-                    "kernels-community/vllm-flash-attn3"
+            # Some models are incompatible with Flash Attention 2:
+            # - M-RoPE models produce 3D position_ids that FA2 misinterprets
+            # - Models with timm vision towers (TimmWrapperModel rejects FA2)
+            # Detect these and fall back to SDPA.
+            use_sdpa = needs_sdpa(model_path)
+            if use_sdpa:
+                logger.warning(
+                    "Disabling flash_attention_2 — model is incompatible "
+                    "(M-RoPE or timm vision tower). Using SDPA instead."
+                )
+                self.base_model_args["attn_implementation"] = "sdpa"
+            else:
+                self.base_model_args["attn_implementation"] = "flash_attention_2"
+                if self.is_gpt_oss:
+                    # vllm-flash-attn3 requires Hopper (SM 9.0+) GPUs;
+                    # GPT-OSS only supports flash-attn3 or eager
+                    device = (
+                        torch.cuda.current_device() if torch.cuda.is_available() else 0
+                    )
+                    major, _ = torch.cuda.get_device_capability(device)
+                    if major >= 9:
+                        self.base_model_args["attn_implementation"] = (
+                            "kernels-community/vllm-flash-attn3"
+                        )
+                    else:
+                        self.base_model_args["attn_implementation"] = "eager"
+                        logger.warning(
+                            "GPT-OSS: flash-attn3 requires Hopper (SM 9.0+) GPUs, "
+                            "but found SM %d.x. Using eager attention instead.",
+                            major,
+                        )
+
+            # For models with timm vision towers: set vision config to eager
+            # while keeping the text model's attention implementation.
+            # timm's TimmWrapperModel rejects both FA2 and SDPA.
+            if has_timm_vision_tower(model_path):
+                attn_impl = self.base_model_args.get(
+                    "attn_implementation", "flash_attention_2"
+                )
+                self.base_model_args["attn_implementation"] = {
+                    "text_config": attn_impl,
+                    "vision_config": "eager",
+                }
+                logger.info(
+                    "Model has timm vision tower — using eager attention for vision, "
+                    "%s for text model.",
+                    attn_impl,
                 )
 
+    @staticmethod
+    def _use_local_mamba_kernels():
+        """Use locally installed mamba_ssm/causal_conv1d instead of Hub kernels.
+
+        Pre-populate the transformers Hub kernel cache with the locally
+        installed packages to avoid PyTorch/CUDA ABI mismatches with the
+        Hub-provided kernel builds.
+        """
+        try:
+            # Third Party
+            from mamba_ssm.ops.triton.selective_state_update import (
+                selective_state_update,
+            )
+            from mamba_ssm.ops.triton.ssd_combined import (
+                mamba_chunk_scan_combined,
+                mamba_split_conv1d_scan_combined,
+            )
+            from transformers.integrations.hub_kernels import _KERNEL_MODULE_MAPPING
+            import causal_conv1d
+            import mamba_ssm
+
+            mamba_ssm.selective_state_update = selective_state_update
+            mamba_ssm.mamba_chunk_scan_combined = mamba_chunk_scan_combined
+            mamba_ssm.mamba_split_conv1d_scan_combined = (
+                mamba_split_conv1d_scan_combined
+            )
+
+            _KERNEL_MODULE_MAPPING["causal-conv1d"] = causal_conv1d
+            _KERNEL_MODULE_MAPPING["mamba-ssm"] = mamba_ssm
+            logger.info("Using local mamba_ssm/causal_conv1d instead of Hub kernels")
+        except (ImportError, AttributeError) as e:
+            logger.warning(
+                "Could not patch mamba kernels (%s); "
+                "GraniteMoeHybrid may use Hub kernels",
+                e,
+            )
+
     def _post_model_init(self):
         """Common initialization steps that should happen after model initialization."""
         self.reconcile_tokenizer()
@@ -271,61 +366,61 @@ def _is_causal_lm_model(self) -> bool:
             bool: True if the model is a causal language model, False otherwise.
         """
         # Third Party
-        return "ForCausalLM" in self.model.__class__.__name__
+        class_name = self.model.__class__.__name__
+        return "ForCausalLM" in class_name or "ForConditionalGeneration" in class_name
+
+    def _get_text_config(self):
+        """Get the text-relevant config, falling back to text_config for VLMs."""
+        config = self.model.config
+        if not hasattr(config, "vocab_size") and hasattr(config, "text_config"):
+            return config.text_config
+        return config
 
     def reconcile_tokenizer(self):
-        if len(self.tokenizer) > self.model.config.vocab_size:
+        text_config = self._get_text_config()
+        if len(self.tokenizer) > text_config.vocab_size:
             logger.warning(
-                f"WARNING: tokenizer has {len(self.tokenizer)} tokens but model has {self.model.config.vocab_size} vocab size"
+                f"WARNING: tokenizer has {len(self.tokenizer)} tokens but model has {text_config.vocab_size} vocab size"
             )
             self.model.resize_token_embeddings(
                 int(8 * math.ceil(len(self.tokenizer) / 8.0))
             )  # make the vocab size multiple of 8 for sharding the embedding layer.
 
         # Fix any discrepancy between model and tokenizer
         if (
-            self.model.config.pad_token_id is not None
+            text_config.pad_token_id is not None
             and self.tokenizer.pad_token_id is not None
-            and self.model.config.pad_token_id != self.tokenizer.pad_token_id
+            and text_config.pad_token_id != self.tokenizer.pad_token_id
         ):
             logger.warning(
-                f"WARNING: There is a mismatch between pad token id of model ({self.model.config.pad_token_id}) and tokenizer({self.tokenizer.pad_token_id}). Fixing model pad token id to be same as tokenizer's pad token id"
+                f"WARNING: There is a mismatch between pad token id of model ({text_config.pad_token_id}) and tokenizer({self.tokenizer.pad_token_id}). Fixing model pad token id to be same as tokenizer's pad token id"
             )
-            self.model.config.pad_token_id = self.tokenizer.pad_token_id
+            text_config.pad_token_id = self.tokenizer.pad_token_id
         if (
-            self.model.config.bos_token_id is not None
+            text_config.bos_token_id is not None
             and self.tokenizer.bos_token_id is not None
-            and self.model.config.bos_token_id != self.tokenizer.bos_token_id
+            and text_config.bos_token_id != self.tokenizer.bos_token_id
         ):
             logging.warning(
-                f"WARNING: There is a mismatch between bos token id of model({self.model.config.bos_token_id}) and tokenizer({self.tokenizer.bos_token_id}). Fixing model bos token id to be same as tokenizer's bos token id"
+                f"WARNING: There is a mismatch between bos token id of model({text_config.bos_token_id}) and tokenizer({self.tokenizer.bos_token_id}). Fixing model bos token id to be same as tokenizer's bos token id"
             )
-            self.model.config.bos_token_id = self.tokenizer.bos_token_id
+            text_config.bos_token_id = self.tokenizer.bos_token_id
         if (
-            self.model.config.eos_token_id is not None
-            and self.tokenizer.eos_token_id
-            and self.model.config.eos_token_id != self.tokenizer.eos_token_id
+            text_config.eos_token_id is not None
+            and self.tokenizer.eos_token_id is not None
+            and text_config.eos_token_id != self.tokenizer.eos_token_id
         ):
             logger.warning(
-                f"WARNING: There is a mismatch between eos token id of model({self.model.config.eos_token_id}) and tokenizer({self.tokenizer.eos_token_id}). Fixing model eos token id to be same as tokenizer's eos token id"
+                f"WARNING: There is a mismatch between eos token id of model({text_config.eos_token_id}) and tokenizer({self.tokenizer.eos_token_id}). Fixing model eos token id to be same as tokenizer's eos token id"
             )
-            self.model.config.eos_token_id = self.tokenizer.eos_token_id
+            text_config.eos_token_id = self.tokenizer.eos_token_id
 
-        if (
-            self.tokenizer.pad_token_id is not None
-            and self.model.config.pad_token_id is None
-        ):
-            self.model.config.pad_token_id = self.tokenizer.pad_token_id
-        if (
-            self.tokenizer.bos_token_id is not None
-            and self.model.config.bos_token_id is None
-        ):
-            self.model.config.bos_token_id = self.tokenizer.bos_token_id
-        if (
-            self.tokenizer.eos_token_id is not None
-            and self.model.config.eos_token_id is None
-        ):
-            self.model.config.eos_token_id = self.tokenizer.eos_token_id
+        if self.tokenizer.pad_token_id is not None and text_config.pad_token_id is None:
+            text_config.pad_token_id = self.tokenizer.pad_token_id
+        if self.tokenizer.bos_token_id is not None and text_config.bos_token_id is None:
+            text_config.bos_token_id = self.tokenizer.bos_token_id
+        if self.tokenizer.eos_token_id is not None and text_config.eos_token_id is None:
+            text_config.eos_token_id = self.tokenizer.eos_token_id
 
         if not self._is_causal_lm_model():
             raise ValueError(
@@ -501,7 +596,12 @@ def __init__(
             lora_config=lora_config,
             lora_quant_bits=lora_quant_bits,
         )
-        self.model = AutoModelForCausalLM.from_pretrained(**self.base_model_args)
+        if is_vlm_with_causal_lm(model_path):
+            self.model = extract_causal_lm_from_vlm(model_path, self.base_model_args)
+        elif is_vlm_for_direct_loading(model_path):
+            self.model = load_vlm_for_text_training(model_path, self.base_model_args)
+        else:
+            self.model = AutoModelForCausalLM.from_pretrained(**self.base_model_args)
         self._post_model_init()
         self.model.gradient_checkpointing_enable()