Skip to content

Commit 0e9e778

Browse files
karol-brejna-ia, astachowiczhabana, and regisss
authored
CWE 476 llava, gpt2, falcon, cohere, all_model (#2208)
Co-authored-by: Adam Stachowicz <105052242+astachowiczhabana@users.noreply.github.com>
Co-authored-by: regisss <15324346+regisss@users.noreply.github.com>
1 parent 0ce775f commit 0e9e778

5 files changed

Lines changed: 13 additions & 5 deletions

File tree

optimum/habana/transformers/models/cohere/modeling_cohere.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def prepare_inputs_for_generation(
323323
# The clone here is for the same reason as for `position_ids`.
324324
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
325325

326-
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
326+
if isinstance(past_key_values, StaticCache) and attention_mask is not None and attention_mask.ndim == 2:
327327
if model_inputs["inputs_embeds"] is not None:
328328
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
329329
device = model_inputs["inputs_embeds"].device

optimum/habana/transformers/models/falcon/modeling_falcon.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,13 @@ def pre_attn_forward(
364364
else:
365365
kv_length = present[0][-2] if reuse_cache else present[0].shape[-2]
366366

367-
if (not reuse_cache) and (token_idx is not None) and (cache_idx is not None) and (query_length == 1):
367+
if (
368+
(not reuse_cache)
369+
and (token_idx is not None)
370+
and (cache_idx is not None)
371+
and (query_length == 1)
372+
and (present is not None)
373+
):
368374
# Return only past key value shapes and not the tensors during decode phase (q len is 1)
369375
# to avoid making past key values as persistent output tensors of HPU graphs.
370376
present = (present[0].shape, present[1].shape)

optimum/habana/transformers/models/gpt_neo/modeling_gpt_neo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ def gaudi_gpt_neo_model_forward(
157157
use_cache = use_cache if use_cache is not None else self.config.use_cache
158158
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
159159

160-
if input_ids is not None and inputs_embeds is not None:
160+
if (input_ids is None) == (inputs_embeds is None):
161161
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
162162
elif input_ids is not None:
163163
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)

optimum/habana/transformers/models/llava/modeling_llava.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def forward(
165165

166166
image_features = None
167167
# 2. Merge text and images
168-
if pixel_values is not None and input_ids.shape[1] != 1:
168+
if pixel_values is not None and input_ids is not None and input_ids.shape[1] != 1:
169169
image_outputs = self.vision_tower(
170170
pixel_values,
171171
output_hidden_states=True,

optimum/habana/transformers/models/modeling_all_models.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,10 @@ def gaudi_invert_attention_mask(self, encoder_attention_mask: torch.Tensor) -> t
113113
"""
114114
if encoder_attention_mask.dim() == 3:
115115
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
116-
if encoder_attention_mask.dim() == 2:
116+
elif encoder_attention_mask.dim() == 2:
117117
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
118+
else:
119+
raise ValueError(f"encoder_attention_mask must be 2D or 3D, but got shape {encoder_attention_mask.shape}")
118120
# T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
119121
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow
120122
# /transformer/transformer_layers.py#L270

0 commit comments

Comments (0)