diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py
index 5d1126a40d..96999abb5f 100644
--- a/keras_hub/api/models/__init__.py
+++ b/keras_hub/api/models/__init__.py
@@ -458,6 +458,15 @@ from keras_hub.src.models.mobilenetv5.mobilenetv5_image_classifier_preprocessor import (
     MobileNetV5ImageClassifierPreprocessor as MobileNetV5ImageClassifierPreprocessor,
 )
+from keras_hub.src.models.moondream.moondream_backbone import (
+    MoondreamBackbone as MoondreamBackbone,
+)
+from keras_hub.src.models.moondream.moondream_causal_lm import (
+    MoondreamCausalLM as MoondreamCausalLM,
+)
+from keras_hub.src.models.moondream.moondream_preprocessor import (
+    MoondreamPreprocessor as MoondreamPreprocessor,
+)
 from keras_hub.src.models.moonshine.moonshine_audio_to_text import (
     MoonshineAudioToText as MoonshineAudioToText,
 )
diff --git a/keras_hub/src/models/moondream/__init__.py b/keras_hub/src/models/moondream/__init__.py
new file mode 100644
index 0000000000..dbbab8df7b
--- /dev/null
+++ b/keras_hub/src/models/moondream/__init__.py
@@ -0,0 +1,4 @@
+from keras_hub.src.models.moondream.moondream_backbone import MoondreamBackbone
+from keras_hub.src.models.moondream.moondream_preprocessor import (
+    MoondreamPreprocessor,
+)
diff --git a/keras_hub/src/models/moondream/moondream_backbone.py b/keras_hub/src/models/moondream/moondream_backbone.py
new file mode 100644
index 0000000000..bec6fa372c
--- /dev/null
+++ b/keras_hub/src/models/moondream/moondream_backbone.py
@@ -0,0 +1,135 @@
+import keras
+from keras import ops
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.backbone import Backbone
+
+
+@keras_hub_export("keras_hub.models.MoondreamBackbone")
+class MoondreamBackbone(Backbone):
+    """The Moondream Backbone model.
+
+    This model connects a vision encoder (SigLIP) and a text decoder
+    (Phi-1.5) using a projection layer. It is designed for vision-language
+    tasks where image features are projected into the text embedding space.
+
+    Args:
+        vision_encoder: A Keras model (e.g., SigLIP). The vision encoder
+            responsible for processing input images.
+        text_decoder: A Keras model (e.g., Phi-1.5). The text decoder
+            responsible for generating text tokens. Must expose a
+            `get_input_embeddings(token_ids)` helper and accept
+            `decoder_inputs_embeds` and `padding_mask` call arguments.
+        projection_dim: int. The dimension to project image features into.
+            Defaults to `2048`.
+        **kwargs: Standard Keras keyword arguments.
+
+    Example:
+    ```python
+    import keras
+    import numpy as np
+    from keras_hub.src.models.moondream.moondream_backbone import (
+        MoondreamBackbone
+    )
+
+    # 1. Create Mock Encoders
+    # Vision Encoder: Maps (378, 378, 3) -> (729, 1152)
+    image_input = keras.Input(shape=(378, 378, 3))
+    vision_output = keras.layers.Lambda(
+        lambda x: keras.ops.ones((keras.ops.shape(x)[0], 729, 1152))
+    )(image_input)
+    vision_encoder = keras.Model(inputs=image_input, outputs=vision_output)
+
+    # Text Decoder: Maps (Seq,) -> (Seq, 2048)
+    text_input = keras.Input(shape=(None,), dtype="int32")
+    text_output = keras.layers.Lambda(
+        lambda x: keras.ops.ones(
+            (keras.ops.shape(x)[0], keras.ops.shape(x)[1], 2048)
+        )
+    )(text_input)
+    text_decoder = keras.Model(inputs=text_input, outputs=text_output)
+
+    # Helper for embeddings
+    text_decoder.get_input_embeddings = lambda x: keras.layers.Embedding(
+        50000, 2048
+    )(x)
+
+    # 2. Instantiate Backbone
+    backbone = MoondreamBackbone(
+        vision_encoder=vision_encoder,
+        text_decoder=text_decoder,
+        projection_dim=2048
+    )
+
+    # 3. Run Forward Pass
+    inputs = {
+        "images": np.random.rand(2, 378, 378, 3),
+        "token_ids": np.random.randint(0, 50000, (2, 10)),
+        "padding_mask": np.ones((2, 10))
+    }
+    outputs = backbone(inputs)
+    ```
+    """
+
+    def __init__(
+        self, vision_encoder, text_decoder, projection_dim=2048, **kwargs
+    ):
+        # === Layers ===
+        self.vision_encoder = vision_encoder
+        self.text_decoder = text_decoder
+        self.vision_projection = keras.layers.Dense(
+            projection_dim, name="vision_projection"
+        )
+
+        # === Functional Model ===
+        images = keras.Input(shape=(None, None, 3), name="images")
+        token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids")
+        padding_mask = keras.Input(
+            shape=(None,), dtype="int32", name="padding_mask"
+        )
+        inputs = {
+            "images": images,
+            "token_ids": token_ids,
+            "padding_mask": padding_mask,
+        }
+
+        # Project image patch features into the text embedding space.
+        image_features = self.vision_encoder(images)
+        projected_images = self.vision_projection(image_features)
+
+        # Prepend projected image embeddings to the text embeddings so the
+        # decoder attends to the image as a prefix.
+        text_embeddings = self.text_decoder.get_input_embeddings(token_ids)
+        combined_embeddings = ops.concatenate(
+            [projected_images, text_embeddings], axis=1
+        )
+
+        # Image positions are always valid; extend the text padding mask
+        # with ones for every image patch.
+        batch_size = ops.shape(images)[0]
+        num_patches = ops.shape(projected_images)[1]
+        image_mask = ops.ones((batch_size, num_patches), dtype="int32")
+        combined_mask = ops.concatenate([image_mask, padding_mask], axis=1)
+
+        outputs = self.text_decoder(
+            inputs=None,
+            decoder_inputs_embeds=combined_embeddings,
+            padding_mask=combined_mask,
+        )
+
+        # Initialize the functional model exactly once. (A previous
+        # revision called `super().__init__(**kwargs)` first and then
+        # `super(MoondreamBackbone, self).__init__(inputs=..., outputs=...,
+        # **kwargs)` again, re-initializing the model and passing `kwargs`
+        # twice.)
+        super().__init__(inputs=inputs, outputs=outputs, **kwargs)
+
+        # === Config ===
+        self.projection_dim = projection_dim
+
+    def get_config(self):
+        """Return a serializable config, including the sub-models."""
+        config = super().get_config()
+        config.update(
+            {
+                "vision_encoder": keras.saving.serialize_keras_object(
+                    self.vision_encoder
+                ),
+                "text_decoder": keras.saving.serialize_keras_object(
+                    self.text_decoder
+                ),
+                "projection_dim": self.projection_dim,
+            }
+        )
+        return config
diff --git a/keras_hub/src/models/moondream/moondream_causal_lm.py
b/keras_hub/src/models/moondream/moondream_causal_lm.py
new file mode 100644
index 0000000000..955e0d7368
--- /dev/null
+++ b/keras_hub/src/models/moondream/moondream_causal_lm.py
@@ -0,0 +1,86 @@
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.causal_lm import CausalLM
+from keras_hub.src.models.moondream.moondream_backbone import MoondreamBackbone
+from keras_hub.src.models.moondream.moondream_preprocessor import (
+    MoondreamPreprocessor,
+)
+
+
+@keras_hub_export("keras_hub.models.MoondreamCausalLM")
+class MoondreamCausalLM(CausalLM):
+    """An end-to-end Moondream model for causal language modeling.
+
+    This model wraps `MoondreamBackbone` and handles the complete flow from
+    raw inputs (images + text) to generated text output. It provides a
+    high-level interface for image captioning and visual question answering.
+
+    Args:
+        backbone: A `MoondreamBackbone` instance. The backbone model that
+            connects the vision encoder and text decoder.
+        preprocessor: A `MoondreamPreprocessor` instance. Handles data
+            preprocessing (tokenization and image resizing).
+        **kwargs: Standard Keras keyword arguments.
+
+    Example:
+    ```python
+    import keras
+    import numpy as np
+    from keras_hub.src.models.moondream.moondream_backbone import (
+        MoondreamBackbone
+    )
+    from keras_hub.src.models.moondream.moondream_causal_lm import (
+        MoondreamCausalLM
+    )
+
+    # 1. Setup Mock Backbone
+    images = keras.Input(shape=(None, None, 3), name="images")
+    token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids")
+    padding_mask = keras.Input(
+        shape=(None,), dtype="int32", name="padding_mask"
+    )
+
+    outputs = keras.layers.Dense(2048)(token_ids)
+
+    backbone = keras.Model(
+        inputs={
+            "images": images,
+            "token_ids": token_ids,
+            "padding_mask": padding_mask
+        },
+        outputs=outputs
+    )
+
+    # 2. Instantiate CausalLM
+    model = MoondreamCausalLM(backbone=backbone)
+
+    # 3. Run Forward Pass
+    inputs = {
+        "images": np.random.rand(2, 378, 378, 3),
+        "token_ids": np.random.randint(0, 100, (2, 10)),
+        "padding_mask": np.ones((2, 10))
+    }
+    outputs = model(inputs)
+    ```
+    """
+
+    backbone_cls = MoondreamBackbone
+    preprocessor_cls = MoondreamPreprocessor
+
+    def __init__(
+        self,
+        backbone,
+        preprocessor=None,
+        **kwargs,
+    ):
+        # === Layers ===
+        # Assign sub-objects before the functional `super().__init__` so
+        # Keras tracks them, matching the KerasHub task convention.
+        self.backbone = backbone
+        self.preprocessor = preprocessor
+
+        # === Functional Model ===
+        # The task forwards the backbone's own inputs straight through it;
+        # the backbone output is used as-is as the task output.
+        inputs = backbone.input
+        outputs = backbone(inputs)
+        super().__init__(
+            inputs=inputs,
+            outputs=outputs,
+            **kwargs,
+        )
diff --git a/keras_hub/src/models/moondream/moondream_preprocessor.py b/keras_hub/src/models/moondream/moondream_preprocessor.py
new file mode 100644
index 0000000000..f961afd07d
--- /dev/null
+++ b/keras_hub/src/models/moondream/moondream_preprocessor.py
@@ -0,0 +1,136 @@
+import keras
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
+
+
+@keras_hub_export("keras_hub.models.MoondreamPreprocessor")
+class MoondreamPreprocessor(CausalLMPreprocessor):
+    """Moondream Causal LM Preprocessor.
+
+    This class handles the preprocessing of images and text for the Moondream
+    model. It combines image resizing/rescaling logic with text tokenization
+    to prepare inputs for the model.
+
+    Args:
+        tokenizer: The tokenizer to be used for text inputs.
+        image_converter: An optional layer or callable for image preprocessing
+            (e.g., resizing, normalization).
+        sequence_length: int. The context length for tokenization.
+            Defaults to 1024.
+        add_start_token: bool. Whether to add the start token.
+            Defaults to True.
+        add_end_token: bool. Whether to add the end token.
+            Defaults to True.
+        **kwargs: Standard Keras keyword arguments.
+
+    Example:
+    ```python
+    import keras
+    import numpy as np
+    from keras_hub.src.models.moondream.moondream_preprocessor import (
+        MoondreamPreprocessor
+    )
+
+    # 1.
Create a Mock Tokenizer
+    class MockTokenizer:
+        def __call__(self, x):
+            return keras.ops.convert_to_tensor([[1, 2, 3]] * len(x))
+
+        def detokenize(self, x):
+            return x
+
+    tokenizer = MockTokenizer()
+
+    # 2. Create an Image Converter
+    image_converter = keras.layers.Resizing(height=378, width=378)
+
+    # 3. Instantiate Preprocessor
+    preprocessor = MoondreamPreprocessor(
+        tokenizer=tokenizer,
+        image_converter=image_converter,
+        sequence_length=128
+    )
+
+    # 4. Preprocess Data
+    inputs = {
+        "images": np.random.randint(0, 255, (2, 500, 500, 3)),
+        "text": ["Describe this image.", "What is in the photo?"]
+    }
+
+    outputs = preprocessor(inputs)
+    ```
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        image_converter=None,
+        sequence_length=1024,
+        add_start_token=True,
+        add_end_token=True,
+        **kwargs,
+    ):
+        # Text handling (tokenization, packing, start/end tokens) is fully
+        # delegated to the CausalLMPreprocessor base class.
+        super().__init__(
+            tokenizer=tokenizer,
+            sequence_length=sequence_length,
+            add_start_token=add_start_token,
+            add_end_token=add_end_token,
+            **kwargs,
+        )
+        self.image_converter = image_converter
+
+    def call(self, x, y=None, sample_weight=None):
+        """Preprocess a batch, accepting raw text or a text+images dict."""
+        if isinstance(x, dict):
+            text_input = x.get("text", "")
+            images = x.get("images", None)
+        else:
+            text_input = x
+            images = None
+
+        # Tokenize/pack the text via the base preprocessor.
+        output = super().call(text_input, y=y, sample_weight=sample_weight)
+
+        # `super().call` may return either `x` alone or an
+        # `(x, y, sample_weight)` tuple; pick out the feature structure.
+        if isinstance(output, tuple):
+            x_out = output[0]
+        else:
+            x_out = output
+
+        if images is not None:
+            if self.image_converter:
+                images = self.image_converter(images)
+            # NOTE(review): images are only attached when the base
+            # preprocessor returned a dict of features; otherwise they are
+            # silently dropped — confirm this is intended.
+            if isinstance(x_out, dict):
+                x_out["images"] = images
+
+        return output
+
+    def generate_preprocess(self, x, sequence_length=None):
+        """Preprocess inputs for generation (no label packing)."""
+        if isinstance(x, dict):
+            text_input = x.get("text", "")
+            images = x.get("images", None)
+        else:
+            text_input = x
+            images = None
+
+        output = super().generate_preprocess(
+            text_input, sequence_length=sequence_length
+        )
+
+        if images is not None:
+            if self.image_converter:
+                images = self.image_converter(images)
+            output["images"] = images
+
+        return output
+
+    def get_config(self):
+        """Return a serializable config, including the image converter."""
+        config = super().get_config()
+        config.update(
+            {
+                "image_converter": keras.saving.serialize_keras_object(
+                    self.image_converter
+                ),
+            }
+        )
+        return config