diff --git a/keras_hub/src/models/clip/clip_backbone.py b/keras_hub/src/models/clip/clip_backbone.py index 653722b763..c7023010d3 100644 --- a/keras_hub/src/models/clip/clip_backbone.py +++ b/keras_hub/src/models/clip/clip_backbone.py @@ -29,7 +29,7 @@ class CLIPBackbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/clip/clip_text_encoder.py b/keras_hub/src/models/clip/clip_text_encoder.py index da528c5e02..1b0fd26cf7 100644 --- a/keras_hub/src/models/clip/clip_text_encoder.py +++ b/keras_hub/src/models/clip/clip_text_encoder.py @@ -34,7 +34,7 @@ class CLIPTextEncoder(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. """ def __init__( diff --git a/keras_hub/src/models/clip/clip_vision_encoder.py b/keras_hub/src/models/clip/clip_vision_encoder.py index 33faac4947..a8e80deeeb 100644 --- a/keras_hub/src/models/clip/clip_vision_encoder.py +++ b/keras_hub/src/models/clip/clip_vision_encoder.py @@ -39,7 +39,7 @@ class CLIPVisionEncoder(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. 
""" def __init__( diff --git a/keras_hub/src/models/depth_anything/depth_anything_backbone.py b/keras_hub/src/models/depth_anything/depth_anything_backbone.py index 77e8a61e36..cfa49ef13d 100644 --- a/keras_hub/src/models/depth_anything/depth_anything_backbone.py +++ b/keras_hub/src/models/depth_anything/depth_anything_backbone.py @@ -50,7 +50,7 @@ class DepthAnythingBackbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/dinov2/dinov2_backbone.py b/keras_hub/src/models/dinov2/dinov2_backbone.py index bff2aee16d..a67799c9ef 100644 --- a/keras_hub/src/models/dinov2/dinov2_backbone.py +++ b/keras_hub/src/models/dinov2/dinov2_backbone.py @@ -68,7 +68,7 @@ class DINOV2Backbone(FeaturePyramidBackbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/dinov3/dinov3_backbone.py b/keras_hub/src/models/dinov3/dinov3_backbone.py index 5f52d9a509..6224ccb38a 100644 --- a/keras_hub/src/models/dinov3/dinov3_backbone.py +++ b/keras_hub/src/models/dinov3/dinov3_backbone.py @@ -63,7 +63,7 @@ class DINOV3Backbone(FeaturePyramidBackbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. 
Example: ```python diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 3230d41862..4b371d2e6f 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -35,7 +35,16 @@ class FluxBackbone(Backbone): use_bias: bool. Whether to apply bias to the query, key, and value projections. guidance_embed: bool. If True, applies guidance embedding in the model. - + image_shape: tuple[int]. Shape of the image input tensor. Defaults to + `(None, 768, 3072)`. + text_shape: tuple[int]. Shape of the text input tensor. Defaults to + `(None, 768, 3072)`. + image_ids_shape: tuple[int]. Shape of the image IDs input tensor. + Defaults to `(None, 768, 3072)`. + text_ids_shape: tuple[int]. Shape of the text IDs input tensor. + Defaults to `(None, 768, 3072)`. + y_shape: tuple[int]. Shape of the additional vector input tensor. + Defaults to `(None, 128)`. Call arguments: image: KerasTensor. Image input tensor of shape (N, L, D) where N is the batch size, L is the sequence length, and D is the feature diff --git a/keras_hub/src/models/gemma/gemma_backbone.py b/keras_hub/src/models/gemma/gemma_backbone.py index a1f5a4d1f7..fbbbd25f65 100644 --- a/keras_hub/src/models/gemma/gemma_backbone.py +++ b/keras_hub/src/models/gemma/gemma_backbone.py @@ -34,7 +34,7 @@ class GemmaBackbone(Backbone): intermediate_dim: int. The output dimension of the first Dense layer in a two-layer feedforward network for each transformer. head_dim: int. The size of each attention head. - layer_norm_epsilon: float. The epsilon value user for every layer norm + layer_norm_epsilon: float. The epsilon value used for every layer norm in the transformer model. dropout: float. Dropout probability for the Transformer encoder. query_head_dim_normalize: boolean. If `True` normalize the query before @@ -55,7 +55,7 @@ class GemmaBackbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. 
The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/gemma3/gemma3_backbone.py b/keras_hub/src/models/gemma3/gemma3_backbone.py index 1c22852b10..00e28ae751 100644 --- a/keras_hub/src/models/gemma3/gemma3_backbone.py +++ b/keras_hub/src/models/gemma3/gemma3_backbone.py @@ -69,7 +69,7 @@ class Gemma3Backbone(Backbone): vision_encoder: A `Gemma3VisionEncoder` instance. `call()` takes in images and returns corresponding sequence of embeddings. If `None`, the model is a text-only model. - layer_norm_epsilon: float. The epsilon value user for every layer norm + layer_norm_epsilon: float. The epsilon value used for every layer norm in all transformer blocks. Defaults to `1e-6`. dropout: float. Dropout probability for the Transformer decoder blocks. Defaults to `0`. diff --git a/keras_hub/src/models/gemma3/gemma3_vision_encoder.py b/keras_hub/src/models/gemma3/gemma3_vision_encoder.py index f37ba0a2e9..f7da45c0c0 100644 --- a/keras_hub/src/models/gemma3/gemma3_vision_encoder.py +++ b/keras_hub/src/models/gemma3/gemma3_vision_encoder.py @@ -25,12 +25,12 @@ class Gemma3VisionEncoder(keras.Model): pool_size: int. Factors by which to downscale `(dim1, dim2)` in the average pooling layer. The same value is used for `"strides"`. Defaults to 14. - layer_norm_epsilon: float. The epsilon value user for every layer norm + layer_norm_epsilon: float. The epsilon value used for every layer norm in all transformer blocks. Defaults to `1e-6`. dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. 
Example: ```python diff --git a/keras_hub/src/models/gpt2/gpt2_backbone.py b/keras_hub/src/models/gpt2/gpt2_backbone.py index 3c661e7f80..e2cfda7198 100644 --- a/keras_hub/src/models/gpt2/gpt2_backbone.py +++ b/keras_hub/src/models/gpt2/gpt2_backbone.py @@ -47,7 +47,7 @@ class GPT2Backbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/gpt_oss/gpt_oss_backbone.py b/keras_hub/src/models/gpt_oss/gpt_oss_backbone.py index 19800cfbda..1132a12347 100644 --- a/keras_hub/src/models/gpt_oss/gpt_oss_backbone.py +++ b/keras_hub/src/models/gpt_oss/gpt_oss_backbone.py @@ -47,7 +47,7 @@ class GptOssBackbone(Backbone): rope_max_wavelength: int. The maximum angular wavelength of the sine/cosine curves, for rotary embeddings. Defaults to `10000`. rope_scaling_factor: float. The scaling factor for - calculation of roatary embedding. Defaults to `1.0`. + calculation of rotary embedding. Defaults to `1.0`. layer_norm_epsilon: float. Epsilon for the layer normalization layers in the transformer decoder. Defaults to `1e-6`. sliding_window: int. The sliding window for the attention diff --git a/keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py b/keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py index bed9831563..d53946aea7 100644 --- a/keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py +++ b/keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py @@ -157,7 +157,7 @@ def __init__( # Check valid pooling. else: raise ValueError( - "Unknown `pooling` type. Polling should be either `'avg'` or " + "Unknown `pooling` type. Pooling should be either `'avg'` or " f"`'max'`. Received: pooling={pooling}." 
) diff --git a/keras_hub/src/models/image_classifier.py b/keras_hub/src/models/image_classifier.py index e75e390899..15f7b0c3e8 100644 --- a/keras_hub/src/models/image_classifier.py +++ b/keras_hub/src/models/image_classifier.py @@ -119,7 +119,7 @@ def __init__( ) else: raise ValueError( - "Unknown `pooling` type. Polling should be either `'avg'` or " + "Unknown `pooling` type. Pooling should be either `'avg'` or " f"`'max'`. Received: pooling={pooling}." ) self.output_dropout = keras.layers.Dropout( diff --git a/keras_hub/src/models/llama3/llama3_backbone.py b/keras_hub/src/models/llama3/llama3_backbone.py index 207f021518..88a0b8ba63 100644 --- a/keras_hub/src/models/llama3/llama3_backbone.py +++ b/keras_hub/src/models/llama3/llama3_backbone.py @@ -33,7 +33,7 @@ class Llama3Backbone(LlamaBackbone): rope_max_wavelength (int, optional): The maximum angular wavelength of the sine/cosine curves, for rotary embeddings. Defaults to `10000`. rope_position_scaling_factor (float, optional): The scaling factor for - calculation of roatary embedding. Defaults to `1.0` + calculation of rotary embedding. Defaults to `1.0` rope_requency_adjustment_factor (float, optional): The scaling factor used to scale the inverse frequencies. rope_low_freq_factor (float, optional): The low frequency factor. diff --git a/keras_hub/src/models/mistral/mistral_backbone.py b/keras_hub/src/models/mistral/mistral_backbone.py index 5f49127a7e..22be29da01 100644 --- a/keras_hub/src/models/mistral/mistral_backbone.py +++ b/keras_hub/src/models/mistral/mistral_backbone.py @@ -45,7 +45,7 @@ class MistralBackbone(Backbone): rope_max_wavelength (int, optional): The maximum angular wavelength of the sine/cosine curves, for rotary embeddings. Defaults to `10000`. rope_scaling_factor (float, optional): The scaling factor for - calculation of roatary embedding. Defaults to `1.0`. + calculation of rotary embedding. Defaults to `1.0`. 
layer_norm_epsilon (float, optional): Epsilon for the layer normalization layers in the transformer decoder. Defaults to `1e-6`. sliding_window (int, optional): The sliding window for the mistral @@ -53,6 +53,8 @@ class MistralBackbone(Backbone): attention layers in each transformer decoder. Only `sliding_window` number of tokens are saved in the cache and used to generate the next token. Defaults to `512`. + dropout (float, optional): Dropout probability for the Transformer + decoder blocks. Defaults to `0`. dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for model computations and weights. Note that some computations, such as softmax and layer normalization, will always be done at diff --git a/keras_hub/src/models/mistral/mistral_transformer_decoder.py b/keras_hub/src/models/mistral/mistral_transformer_decoder.py index 79d5e93f7a..531549e3aa 100644 --- a/keras_hub/src/models/mistral/mistral_transformer_decoder.py +++ b/keras_hub/src/models/mistral/mistral_transformer_decoder.py @@ -207,7 +207,7 @@ def _compute_self_attention_mask( else self_attention_cache_update_index ) - # The lower traingular attention mask + # The lower triangular attention mask causal_mask = compute_causal_mask( batch_size, input_length, output_length, cache_update_index ) diff --git a/keras_hub/src/models/mixtral/mixtral_backbone.py b/keras_hub/src/models/mixtral/mixtral_backbone.py index 797504eb4e..28cbcf5a78 100644 --- a/keras_hub/src/models/mixtral/mixtral_backbone.py +++ b/keras_hub/src/models/mixtral/mixtral_backbone.py @@ -44,7 +44,7 @@ class MixtralBackbone(Backbone): rope_max_wavelength (int, optional): The maximum angular wavelength of the sine/cosine curves, for rotary embeddings. Defaults to `10000`. rope_scaling_factor (float, optional): The scaling factor for - calculation of roatary embedding. Defaults to `1.0`. + calculation of rotary embedding. Defaults to `1.0`. 
layer_norm_epsilon (float, optional): Epsilon for the layer normalization layers in the transformer decoder. Defaults to `1e-6`. sliding_window (int, optional): The sliding window for the mixtral diff --git a/keras_hub/src/models/mixtral/mixtral_decoder.py b/keras_hub/src/models/mixtral/mixtral_decoder.py index d9d35bbb16..3a380a2d71 100644 --- a/keras_hub/src/models/mixtral/mixtral_decoder.py +++ b/keras_hub/src/models/mixtral/mixtral_decoder.py @@ -447,7 +447,7 @@ def _compute_self_attention_mask( else self_attention_cache_update_index ) - # The lower traingular attention mask + # The lower triangular attention mask causal_mask = compute_causal_mask( batch_size, input_length, output_length, cache_update_index ) diff --git a/keras_hub/src/models/pali_gemma/pali_gemma_backbone.py b/keras_hub/src/models/pali_gemma/pali_gemma_backbone.py index 7ed13c3317..a044241dd5 100644 --- a/keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +++ b/keras_hub/src/models/pali_gemma/pali_gemma_backbone.py @@ -55,7 +55,7 @@ class PaliGemmaBackbone(Backbone): in a two-layer feedforward network for vision transformer. Defaults to `4304`. vit_pooling: `None` or string. The encoded vision embeddings are pooled - using the specified polling setting. The accepted values are + using the specified pooling setting. The accepted values are `"map"`, `"gap"`, `"0"` or `None`. Defaults to `None`. vit_classifier_activation: activation function. The activation that is used for final output classification in the vision transformer. @@ -76,14 +76,14 @@ class PaliGemmaBackbone(Backbone): window attention. Defaults to `False`. sliding_window_size: int. Size of the sliding local window. Defaults to `4096`. - layer_norm_epsilon: float. The epsilon value user for every layer norm + layer_norm_epsilon: float. The epsilon value used for every layer norm in all transformer blocks. Defaults to `1e-6`. dropout: float. Dropout probability for the Transformer decoder blocks. Defaults to `0`. 
dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python @@ -100,7 +100,7 @@ class PaliGemmaBackbone(Backbone): # Randomly initialized PaliGemma decoder with custom config. model = keras_hub.models.PaliGemmaBackbone( vocabulary_size=50257, - images_size=224, + image_size=224, num_layers=12, num_query_heads=12, num_key_value_heads=1, diff --git a/keras_hub/src/models/pali_gemma/pali_gemma_vit.py b/keras_hub/src/models/pali_gemma/pali_gemma_vit.py index 2812a497bc..4534b0c38c 100644 --- a/keras_hub/src/models/pali_gemma/pali_gemma_vit.py +++ b/keras_hub/src/models/pali_gemma/pali_gemma_vit.py @@ -426,14 +426,14 @@ class PaliGemmaVit(keras.Model): as a image classifier, this value would correspond to the number of output classes. pooling: string. The encoded vision embeddings are pooled using the - specified polling setting. The accepted values are `"map"`, `"gap"`, + specified pooling setting. The accepted values are `"map"`, `"gap"`, `"zero"` or `None`. Defaults to `None`. - classifier_activation: activation fucntion. The activation that is used - for final output classification + classifier_activation: activation function. The activation that is used + for final output classification. dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. 
Example: ```python diff --git a/keras_hub/src/models/siglip/siglip_backbone.py b/keras_hub/src/models/siglip/siglip_backbone.py index 7629f4d7b5..3676750e8a 100644 --- a/keras_hub/src/models/siglip/siglip_backbone.py +++ b/keras_hub/src/models/siglip/siglip_backbone.py @@ -31,7 +31,7 @@ class SigLIPBackbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/siglip/siglip_text_encoder.py b/keras_hub/src/models/siglip/siglip_text_encoder.py index 82f640ce9f..d325fd6674 100644 --- a/keras_hub/src/models/siglip/siglip_text_encoder.py +++ b/keras_hub/src/models/siglip/siglip_text_encoder.py @@ -32,7 +32,7 @@ class SigLIPTextEncoder(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. """ def __init__( diff --git a/keras_hub/src/models/siglip/siglip_vision_encoder.py b/keras_hub/src/models/siglip/siglip_vision_encoder.py index b68e03342d..07563ac34d 100644 --- a/keras_hub/src/models/siglip/siglip_vision_encoder.py +++ b/keras_hub/src/models/siglip/siglip_vision_encoder.py @@ -41,7 +41,7 @@ class SigLIPVisionEncoder(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. 
""" def __init__( diff --git a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py index 974439178b..0a7c65f5d4 100644 --- a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +++ b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py @@ -222,7 +222,7 @@ class StableDiffusion3Backbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/vgg/vgg_image_classifier.py b/keras_hub/src/models/vgg/vgg_image_classifier.py index a72b256288..3c1929a5b8 100644 --- a/keras_hub/src/models/vgg/vgg_image_classifier.py +++ b/keras_hub/src/models/vgg/vgg_image_classifier.py @@ -142,7 +142,7 @@ def __init__( ) else: raise ValueError( - "Unknown `pooling` type. Polling should be either `'avg'` or " + "Unknown `pooling` type. Pooling should be either `'avg'` or " f"`'max'`. Received: pooling={pooling}." ) diff --git a/keras_hub/src/samplers/random_sampler.py b/keras_hub/src/samplers/random_sampler.py index 368f7ca71e..07fa95accc 100644 --- a/keras_hub/src/samplers/random_sampler.py +++ b/keras_hub/src/samplers/random_sampler.py @@ -45,8 +45,10 @@ def __init__( def get_next_token(self, probabilities): # Sample the next token from the probability distribution. + # tf does not support half precision multinomial sampling, so make + # sure we have full precision here. 
next_token_id = random.categorical( - ops.log(probabilities), + ops.cast(ops.log(probabilities), "float32"), 1, seed=self.seed_generator, dtype="int32", diff --git a/keras_hub/src/samplers/top_p_sampler.py b/keras_hub/src/samplers/top_p_sampler.py index 4477acaf77..8a83c3d5f6 100644 --- a/keras_hub/src/samplers/top_p_sampler.py +++ b/keras_hub/src/samplers/top_p_sampler.py @@ -79,7 +79,9 @@ def get_next_token(self, probabilities): ops.zeros(ops.shape(sorted_preds), dtype=sorted_preds.dtype), ) sorted_next_token = random.categorical( - ops.log(probabilities), + # tf does not support half precision multinomial sampling, so make + # sure we have full precision here. + ops.cast(ops.log(probabilities), "float32"), 1, seed=self.seed_generator, dtype="int32",