diff --git a/keras_hub/src/models/clip/clip_backbone.py b/keras_hub/src/models/clip/clip_backbone.py index 653722b763..c7023010d3 100644 --- a/keras_hub/src/models/clip/clip_backbone.py +++ b/keras_hub/src/models/clip/clip_backbone.py @@ -29,7 +29,7 @@ class CLIPBackbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/clip/clip_text_encoder.py b/keras_hub/src/models/clip/clip_text_encoder.py index da528c5e02..1b0fd26cf7 100644 --- a/keras_hub/src/models/clip/clip_text_encoder.py +++ b/keras_hub/src/models/clip/clip_text_encoder.py @@ -34,7 +34,7 @@ class CLIPTextEncoder(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. """ def __init__( diff --git a/keras_hub/src/models/clip/clip_vision_encoder.py b/keras_hub/src/models/clip/clip_vision_encoder.py index 33faac4947..a8e80deeeb 100644 --- a/keras_hub/src/models/clip/clip_vision_encoder.py +++ b/keras_hub/src/models/clip/clip_vision_encoder.py @@ -39,7 +39,7 @@ class CLIPVisionEncoder(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. 
""" def __init__( diff --git a/keras_hub/src/models/depth_anything/depth_anything_backbone.py b/keras_hub/src/models/depth_anything/depth_anything_backbone.py index 77e8a61e36..cfa49ef13d 100644 --- a/keras_hub/src/models/depth_anything/depth_anything_backbone.py +++ b/keras_hub/src/models/depth_anything/depth_anything_backbone.py @@ -50,7 +50,7 @@ class DepthAnythingBackbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/dinov2/dinov2_backbone.py b/keras_hub/src/models/dinov2/dinov2_backbone.py index bff2aee16d..a67799c9ef 100644 --- a/keras_hub/src/models/dinov2/dinov2_backbone.py +++ b/keras_hub/src/models/dinov2/dinov2_backbone.py @@ -68,7 +68,7 @@ class DINOV2Backbone(FeaturePyramidBackbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/dinov3/dinov3_backbone.py b/keras_hub/src/models/dinov3/dinov3_backbone.py index 5f52d9a509..6224ccb38a 100644 --- a/keras_hub/src/models/dinov3/dinov3_backbone.py +++ b/keras_hub/src/models/dinov3/dinov3_backbone.py @@ -63,7 +63,7 @@ class DINOV3Backbone(FeaturePyramidBackbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. 
Example: ```python diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 3230d41862..4b371d2e6f 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -35,7 +35,16 @@ class FluxBackbone(Backbone): use_bias: bool. Whether to apply bias to the query, key, and value projections. guidance_embed: bool. If True, applies guidance embedding in the model. - + image_shape: tuple[int]. Shape of the image input tensor. Defaults to + `(None, 768, 3072)`. + text_shape: tuple[int]. Shape of the text input tensor. Defaults to + `(None, 768, 3072)`. + image_ids_shape: tuple[int]. Shape of the image IDs input tensor. + Defaults to `(None, 768, 3072)`. + text_ids_shape: tuple[int]. Shape of the text IDs input tensor. + Defaults to `(None, 768, 3072)`. + y_shape: tuple[int]. Shape of the additional vector input tensor. + Defaults to `(None, 128)`. Call arguments: image: KerasTensor. Image input tensor of shape (N, L, D) where N is the batch size, L is the sequence length, and D is the feature diff --git a/keras_hub/src/models/gemma/gemma_backbone.py b/keras_hub/src/models/gemma/gemma_backbone.py index a1f5a4d1f7..fbbbd25f65 100644 --- a/keras_hub/src/models/gemma/gemma_backbone.py +++ b/keras_hub/src/models/gemma/gemma_backbone.py @@ -34,7 +34,7 @@ class GemmaBackbone(Backbone): intermediate_dim: int. The output dimension of the first Dense layer in a two-layer feedforward network for each transformer. head_dim: int. The size of each attention head. - layer_norm_epsilon: float. The epsilon value user for every layer norm + layer_norm_epsilon: float. The epsilon value used for every layer norm in the transformer model. dropout: float. Dropout probability for the Transformer encoder. query_head_dim_normalize: boolean. If `True` normalize the query before @@ -55,7 +55,7 @@ class GemmaBackbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. 
The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/gemma3/gemma3_backbone.py b/keras_hub/src/models/gemma3/gemma3_backbone.py index 1c22852b10..00e28ae751 100644 --- a/keras_hub/src/models/gemma3/gemma3_backbone.py +++ b/keras_hub/src/models/gemma3/gemma3_backbone.py @@ -69,7 +69,7 @@ class Gemma3Backbone(Backbone): vision_encoder: A `Gemma3VisionEncoder` instance. `call()` takes in images and returns corresponding sequence of embeddings. If `None`, the model is a text-only model. - layer_norm_epsilon: float. The epsilon value user for every layer norm + layer_norm_epsilon: float. The epsilon value used for every layer norm in all transformer blocks. Defaults to `1e-6`. dropout: float. Dropout probability for the Transformer decoder blocks. Defaults to `0`. diff --git a/keras_hub/src/models/gemma3/gemma3_vision_encoder.py b/keras_hub/src/models/gemma3/gemma3_vision_encoder.py index f37ba0a2e9..f7da45c0c0 100644 --- a/keras_hub/src/models/gemma3/gemma3_vision_encoder.py +++ b/keras_hub/src/models/gemma3/gemma3_vision_encoder.py @@ -25,12 +25,12 @@ class Gemma3VisionEncoder(keras.Model): pool_size: int. Factors by which to downscale `(dim1, dim2)` in the average pooling layer. The same value is used for `"strides"`. Defaults to 14. - layer_norm_epsilon: float. The epsilon value user for every layer norm + layer_norm_epsilon: float. The epsilon value used for every layer norm in all transformer blocks. Defaults to `1e-6`. dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. 
Example: ```python diff --git a/keras_hub/src/models/gpt2/gpt2_backbone.py b/keras_hub/src/models/gpt2/gpt2_backbone.py index 3c661e7f80..e2cfda7198 100644 --- a/keras_hub/src/models/gpt2/gpt2_backbone.py +++ b/keras_hub/src/models/gpt2/gpt2_backbone.py @@ -47,7 +47,7 @@ class GPT2Backbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/gpt_oss/gpt_oss_backbone.py b/keras_hub/src/models/gpt_oss/gpt_oss_backbone.py index 19800cfbda..1132a12347 100644 --- a/keras_hub/src/models/gpt_oss/gpt_oss_backbone.py +++ b/keras_hub/src/models/gpt_oss/gpt_oss_backbone.py @@ -47,7 +47,7 @@ class GptOssBackbone(Backbone): rope_max_wavelength: int. The maximum angular wavelength of the sine/cosine curves, for rotary embeddings. Defaults to `10000`. rope_scaling_factor: float. The scaling factor for - calculation of roatary embedding. Defaults to `1.0`. + calculation of rotary embedding. Defaults to `1.0`. layer_norm_epsilon: float. Epsilon for the layer normalization layers in the transformer decoder. Defaults to `1e-6`. sliding_window: int. The sliding window for the attention diff --git a/keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py b/keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py index bed9831563..d53946aea7 100644 --- a/keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py +++ b/keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py @@ -157,7 +157,7 @@ def __init__( # Check valid pooling. else: raise ValueError( - "Unknown `pooling` type. Polling should be either `'avg'` or " + "Unknown `pooling` type. Pooling should be either `'avg'` or " f"`'max'`. Received: pooling={pooling}." 
) diff --git a/keras_hub/src/models/image_classifier.py b/keras_hub/src/models/image_classifier.py index e75e390899..15f7b0c3e8 100644 --- a/keras_hub/src/models/image_classifier.py +++ b/keras_hub/src/models/image_classifier.py @@ -119,7 +119,7 @@ def __init__( ) else: raise ValueError( - "Unknown `pooling` type. Polling should be either `'avg'` or " + "Unknown `pooling` type. Pooling should be either `'avg'` or " f"`'max'`. Received: pooling={pooling}." ) self.output_dropout = keras.layers.Dropout( diff --git a/keras_hub/src/models/llama3/llama3_backbone.py b/keras_hub/src/models/llama3/llama3_backbone.py index 207f021518..88a0b8ba63 100644 --- a/keras_hub/src/models/llama3/llama3_backbone.py +++ b/keras_hub/src/models/llama3/llama3_backbone.py @@ -33,7 +33,7 @@ class Llama3Backbone(LlamaBackbone): rope_max_wavelength (int, optional): The maximum angular wavelength of the sine/cosine curves, for rotary embeddings. Defaults to `10000`. rope_position_scaling_factor (float, optional): The scaling factor for - calculation of roatary embedding. Defaults to `1.0` + calculation of rotary embedding. Defaults to `1.0` rope_requency_adjustment_factor (float, optional): The scaling factor used to scale the inverse frequencies. rope_low_freq_factor (float, optional): The low frequency factor. diff --git a/keras_hub/src/models/mistral/mistral_backbone.py b/keras_hub/src/models/mistral/mistral_backbone.py index 5f49127a7e..22be29da01 100644 --- a/keras_hub/src/models/mistral/mistral_backbone.py +++ b/keras_hub/src/models/mistral/mistral_backbone.py @@ -45,7 +45,7 @@ class MistralBackbone(Backbone): rope_max_wavelength (int, optional): The maximum angular wavelength of the sine/cosine curves, for rotary embeddings. Defaults to `10000`. rope_scaling_factor (float, optional): The scaling factor for - calculation of roatary embedding. Defaults to `1.0`. + calculation of rotary embedding. Defaults to `1.0`. 
layer_norm_epsilon (float, optional): Epsilon for the layer normalization layers in the transformer decoder. Defaults to `1e-6`. sliding_window (int, optional): The sliding window for the mistral @@ -53,6 +53,8 @@ class MistralBackbone(Backbone): attention layers in each transformer decoder. Only `sliding_window` number of tokens are saved in the cache and used to generate the next token. Defaults to `512`. + dropout (float, optional): Dropout probability for the Transformer + decoder blocks. Defaults to `0`. dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for model computations and weights. Note that some computations, such as softmax and layer normalization, will always be done at diff --git a/keras_hub/src/models/mistral/mistral_transformer_decoder.py b/keras_hub/src/models/mistral/mistral_transformer_decoder.py index 79d5e93f7a..531549e3aa 100644 --- a/keras_hub/src/models/mistral/mistral_transformer_decoder.py +++ b/keras_hub/src/models/mistral/mistral_transformer_decoder.py @@ -207,7 +207,7 @@ def _compute_self_attention_mask( else self_attention_cache_update_index ) - # The lower traingular attention mask + # The lower triangular attention mask causal_mask = compute_causal_mask( batch_size, input_length, output_length, cache_update_index ) diff --git a/keras_hub/src/models/mixtral/mixtral_backbone.py b/keras_hub/src/models/mixtral/mixtral_backbone.py index 797504eb4e..28cbcf5a78 100644 --- a/keras_hub/src/models/mixtral/mixtral_backbone.py +++ b/keras_hub/src/models/mixtral/mixtral_backbone.py @@ -44,7 +44,7 @@ class MixtralBackbone(Backbone): rope_max_wavelength (int, optional): The maximum angular wavelength of the sine/cosine curves, for rotary embeddings. Defaults to `10000`. rope_scaling_factor (float, optional): The scaling factor for - calculation of roatary embedding. Defaults to `1.0`. + calculation of rotary embedding. Defaults to `1.0`. 
layer_norm_epsilon (float, optional): Epsilon for the layer normalization layers in the transformer decoder. Defaults to `1e-6`. sliding_window (int, optional): The sliding window for the mixtral diff --git a/keras_hub/src/models/mixtral/mixtral_decoder.py b/keras_hub/src/models/mixtral/mixtral_decoder.py index d9d35bbb16..3a380a2d71 100644 --- a/keras_hub/src/models/mixtral/mixtral_decoder.py +++ b/keras_hub/src/models/mixtral/mixtral_decoder.py @@ -447,7 +447,7 @@ def _compute_self_attention_mask( else self_attention_cache_update_index ) - # The lower traingular attention mask + # The lower triangular attention mask causal_mask = compute_causal_mask( batch_size, input_length, output_length, cache_update_index ) diff --git a/keras_hub/src/models/pali_gemma/pali_gemma_backbone.py b/keras_hub/src/models/pali_gemma/pali_gemma_backbone.py index 7ed13c3317..a044241dd5 100644 --- a/keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +++ b/keras_hub/src/models/pali_gemma/pali_gemma_backbone.py @@ -55,7 +55,7 @@ class PaliGemmaBackbone(Backbone): in a two-layer feedforward network for vision transformer. Defaults to `4304`. vit_pooling: `None` or string. The encoded vision embeddings are pooled - using the specified polling setting. The accepted values are + using the specified pooling setting. The accepted values are `"map"`, `"gap"`, `"0"` or `None`. Defaults to `None`. vit_classifier_activation: activation function. The activation that is used for final output classification in the vision transformer. @@ -76,14 +76,14 @@ class PaliGemmaBackbone(Backbone): window attention. Defaults to `False`. sliding_window_size: int. Size of the sliding local window. Defaults to `4096`. - layer_norm_epsilon: float. The epsilon value user for every layer norm + layer_norm_epsilon: float. The epsilon value used for every layer norm in all transformer blocks. Defaults to `1e-6`. dropout: float. Dropout probability for the Transformer decoder blocks. Defaults to `0`. 
dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python @@ -100,7 +100,7 @@ class PaliGemmaBackbone(Backbone): # Randomly initialized PaliGemma decoder with custom config. model = keras_hub.models.PaliGemmaBackbone( vocabulary_size=50257, - images_size=224, + image_size=224, num_layers=12, num_query_heads=12, num_key_value_heads=1, diff --git a/keras_hub/src/models/pali_gemma/pali_gemma_vit.py b/keras_hub/src/models/pali_gemma/pali_gemma_vit.py index 2812a497bc..4534b0c38c 100644 --- a/keras_hub/src/models/pali_gemma/pali_gemma_vit.py +++ b/keras_hub/src/models/pali_gemma/pali_gemma_vit.py @@ -426,14 +426,14 @@ class PaliGemmaVit(keras.Model): as a image classifier, this value would correspond to the number of output classes. pooling: string. The encoded vision embeddings are pooled using the - specified polling setting. The accepted values are `"map"`, `"gap"`, + specified pooling setting. The accepted values are `"map"`, `"gap"`, `"zero"` or `None`. Defaults to `None`. - classifier_activation: activation fucntion. The activation that is used - for final output classification + classifier_activation: activation function. The activation that is used + for final output classification. dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. 
Example: ```python diff --git a/keras_hub/src/models/siglip/siglip_backbone.py b/keras_hub/src/models/siglip/siglip_backbone.py index 7629f4d7b5..3676750e8a 100644 --- a/keras_hub/src/models/siglip/siglip_backbone.py +++ b/keras_hub/src/models/siglip/siglip_backbone.py @@ -31,7 +31,7 @@ class SigLIPBackbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/siglip/siglip_text_encoder.py b/keras_hub/src/models/siglip/siglip_text_encoder.py index 82f640ce9f..d325fd6674 100644 --- a/keras_hub/src/models/siglip/siglip_text_encoder.py +++ b/keras_hub/src/models/siglip/siglip_text_encoder.py @@ -32,7 +32,7 @@ class SigLIPTextEncoder(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. """ def __init__( diff --git a/keras_hub/src/models/siglip/siglip_vision_encoder.py b/keras_hub/src/models/siglip/siglip_vision_encoder.py index b68e03342d..07563ac34d 100644 --- a/keras_hub/src/models/siglip/siglip_vision_encoder.py +++ b/keras_hub/src/models/siglip/siglip_vision_encoder.py @@ -41,7 +41,7 @@ class SigLIPVisionEncoder(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. 
""" def __init__( diff --git a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py index 974439178b..0a7c65f5d4 100644 --- a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +++ b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py @@ -222,7 +222,7 @@ class StableDiffusion3Backbone(Backbone): dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use for the models computations and weights. Note that some computations, such as softmax and layer normalization will always - be done a float32 precision regardless of dtype. + be done in float32 precision regardless of dtype. Example: ```python diff --git a/keras_hub/src/models/vgg/vgg_image_classifier.py b/keras_hub/src/models/vgg/vgg_image_classifier.py index a72b256288..3c1929a5b8 100644 --- a/keras_hub/src/models/vgg/vgg_image_classifier.py +++ b/keras_hub/src/models/vgg/vgg_image_classifier.py @@ -142,7 +142,7 @@ def __init__( ) else: raise ValueError( - "Unknown `pooling` type. Polling should be either `'avg'` or " + "Unknown `pooling` type. Pooling should be either `'avg'` or " f"`'max'`. Received: pooling={pooling}." ) diff --git a/keras_hub/src/samplers/random_sampler.py b/keras_hub/src/samplers/random_sampler.py index 368f7ca71e..07fa95accc 100644 --- a/keras_hub/src/samplers/random_sampler.py +++ b/keras_hub/src/samplers/random_sampler.py @@ -45,8 +45,10 @@ def __init__( def get_next_token(self, probabilities): # Sample the next token from the probability distribution. + # tf does not support half precision multinomial sampling, so make + # sure we have full precision here. 
next_token_id = random.categorical( - ops.log(probabilities), + ops.cast(ops.log(probabilities), "float32"), 1, seed=self.seed_generator, dtype="int32", diff --git a/keras_hub/src/samplers/top_p_sampler.py b/keras_hub/src/samplers/top_p_sampler.py index 4477acaf77..8a83c3d5f6 100644 --- a/keras_hub/src/samplers/top_p_sampler.py +++ b/keras_hub/src/samplers/top_p_sampler.py @@ -79,7 +79,9 @@ def get_next_token(self, probabilities): ops.zeros(ops.shape(sorted_preds), dtype=sorted_preds.dtype), ) sorted_next_token = random.categorical( - ops.log(probabilities), + # tf does not support half precision multinomial sampling, so make + # sure we have full precision here. + ops.cast(ops.log(probabilities), "float32"), 1, seed=self.seed_generator, dtype="int32",