-
Notifications
You must be signed in to change notification settings - Fork 330
Port Qwen2.5-VL Model #2574
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Port Qwen2.5-VL Model #2574
Changes from 5 commits
9876249
fc181bf
309fb75
565f2d9
bd58f5f
a6676d9
b382cab
bd94a92
9145d81
417d20b
404dd11
ecf15b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| import keras | ||
| from keras_hub.src.models.backbone import Backbone | ||
| from keras_hub.src.models.qwen.qwen_backbone import QwenBackbone | ||
|
||
| from keras_hub.src.models.qwen2_vl.qwen2_vl_vision_encoder import Qwen2VLVisionEncoder | ||
| from keras_hub.src.models.qwen2_vl.qwen2_vl_projector import Qwen2VLProjector | ||
|
|
||
class Qwen2VLBackbone(Backbone):
    """Combined vision + text backbone for Qwen2-VL.

    Wires together a vision encoder, a projector that maps vision
    features into the language model's embedding space, and a standard
    Qwen (2/2.5) text backbone.

    Args:
        vision_encoder: layer mapping image input to patch features.
        projector: layer mapping vision features to the LLM embedding
            size.
        text_backbone: the standard Qwen (2/2.5) LLM backbone.
    """

    def __init__(
        self,
        vision_encoder,
        projector,
        text_backbone,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vision_encoder = vision_encoder
        self.projector = projector
        # The standard Qwen (2/2.5) language-model backbone.
        self.text_backbone = text_backbone

    def call(self, inputs):
        # `inputs` is a dict containing "images" and "token_ids".
        images = inputs["images"]
        token_ids = inputs["token_ids"]

        # Encode images into patch features.
        image_features = self.vision_encoder(images)

        # Project image features into the text embedding space.
        image_embeddings = self.projector(image_features)

        # Embed the text tokens normally.
        text_embeddings = self.text_backbone.token_embedding(token_ids)

        # Placeholder fusion: prepend image embeddings along the
        # sequence axis. NOTE(review): a faithful Qwen2-VL port splices
        # image embeddings in at the image-pad token positions instead
        # of concatenating — confirm before shipping.
        combined_embeddings = keras.ops.concatenate(
            [image_embeddings, text_embeddings], axis=1
        )

        # `transformer_layers` on KerasHub backbones is a Python list
        # of layers, not a callable — the original code called the list
        # directly, which raises TypeError. Apply each layer in order.
        x = combined_embeddings
        for transformer_layer in self.text_backbone.transformer_layers:
            x = transformer_layer(x)
        x = self.text_backbone.layer_norm(x)
        return x
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| import keras | ||
| from keras import layers | ||
| from keras import ops | ||
|
|
||
class Qwen2VLProjector(layers.Layer):
    """Projector layer for Qwen2-VL.

    Downsamples vision features by merging each 2x2 neighborhood of
    patches into a single token and projecting the merged vector to
    the LLM's hidden size.

    Args:
        hidden_size: size of the incoming vision features.
            NOTE(review): currently only recorded; the merger infers
            its input width from the data — confirm whether the first
            projection should use it.
        output_hidden_size: target (LLM) embedding size.
    """

    def __init__(self, hidden_size, output_hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.output_hidden_size = output_hidden_size

        # `Sequential` lives on the top-level `keras` namespace, not
        # under `keras.layers` — the original `layers.Sequential`
        # raises AttributeError at construction time.
        self.merger = keras.Sequential(
            [
                layers.Dense(output_hidden_size, name="merger_proj"),
                layers.Activation("gelu", name="activation"),
                layers.Dense(output_hidden_size, name="output_proj"),
            ],
            name="merger",
        )

    def call(self, x):
        # x shape: (batch, height, width, channels).
        input_shape = ops.shape(x)
        height = input_shape[1]
        width = input_shape[2]
        channels = input_shape[3]

        # Isolate 2x2 blocks: (B, H/2, 2, W/2, 2, C).
        x = ops.reshape(x, (-1, height // 2, 2, width // 2, 2, channels))

        # Bring each 2x2 block together: (B, H/2, W/2, 2, 2, C).
        x = ops.transpose(x, (0, 1, 3, 2, 4, 5))

        # Flatten each 2x2xC block into one vector: (B, H/2, W/2, 4*C).
        x = ops.reshape(x, (-1, height // 2, width // 2, 4 * channels))

        return self.merger(x)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| import keras | ||
| from keras import layers | ||
| from keras import ops | ||
|
|
||
| from keras_hub.src.layers.modeling.reversible_embedding import ReversibleEmbedding | ||
| from keras_hub.src.models.backbone import Backbone | ||
|
|
||
|
|
||
class Qwen2VLVisionEncoder(Backbone):
    """Vision encoder for Qwen2-VL.

    Patch-embeds image/video input with a 3D convolution (the temporal
    kernel handles video frames), runs `depth` transformer blocks, and
    downsamples tokens with a 2x2 patch merger before handing them to
    the LLM.

    Args:
        patch_size: spatial patch size of the embedding convolution.
        temporal_patch_size: temporal patch size (frames per patch).
        hidden_size: channel width of the vision features.
        depth: number of transformer blocks.
        num_heads: attention heads per block.
        mlp_ratio: MLP expansion factor per block.
        activation: activation name passed to each block.
    """

    def __init__(
        self,
        patch_size=14,
        temporal_patch_size=2,
        hidden_size=1152,
        depth=27,
        num_heads=16,
        mlp_ratio=4,
        activation="silu",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.hidden_size = hidden_size
        self.depth = depth
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.activation = activation

        # 3D convolution to handle both video (time) and images.
        self.patch_embed = layers.Conv3D(
            filters=hidden_size,
            kernel_size=(temporal_patch_size, patch_size, patch_size),
            strides=(temporal_patch_size, patch_size, patch_size),
            padding="valid",
            name="patch_embed",
        )

        # Placeholder for Qwen2VL transformer blocks.
        self.blocks = [
            Qwen2VLVisionBlock(
                hidden_size, num_heads, mlp_ratio, activation, name=f"blocks.{i}"
            )
            for i in range(depth)
        ]

        # Patch merger to downsample tokens before sending to the LLM.
        # NOTE(review): `patch_embed` (Conv3D) emits a rank-5 tensor
        # (batch, time, height, width, channels), but Conv2D expects
        # rank-4 input — a reshape/squeeze between the blocks and this
        # merger appears to be missing. Confirm the intended token
        # layout before wiring this into the backbone.
        self.merger = layers.Conv2D(
            filters=hidden_size,
            kernel_size=2,
            strides=2,
            padding="valid",
            name="merger",
        )

    def call(self, x, grid_thw=None):
        # x shape: (batch, time, height, width, channels).
        x = self.patch_embed(x)

        # Note: 3D-RoPE implementation pending. `grid_thw` is threaded
        # through to the blocks but presently unused there —
        # TODO confirm once positional encoding lands.
        for block in self.blocks:
            x = block(x, grid_thw=grid_thw)

        x = self.merger(x)

        return x
|
|
||
class Qwen2VLVisionBlock(layers.Layer):
    """Pre-norm transformer block for the Qwen2-VL vision encoder.

    Args:
        hidden_size: channel width of the residual stream.
        num_heads: number of attention heads.
        mlp_ratio: expansion factor for the MLP hidden layer.
        activation: activation applied inside the MLP.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio, activation, **kwargs):
        super().__init__(**kwargs)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.attn = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=hidden_size // num_heads
        )
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        # Two-layer MLP: expand by `mlp_ratio` with the configured
        # activation, then project back to `hidden_size`. The original
        # single Dense(hidden_size * mlp_ratio) left the features
        # widened, so the residual add below failed with a shape
        # mismatch, and `activation` was silently ignored.
        self.mlp_up = layers.Dense(hidden_size * mlp_ratio, activation=activation)
        self.mlp_down = layers.Dense(hidden_size)

    def call(self, x, grid_thw=None):
        # Self-attention sub-block with residual connection.
        residual = x
        x = self.norm1(x)
        x = self.attn(x, x)
        x = x + residual

        # MLP sub-block with residual connection.
        residual = x
        x = self.norm2(x)
        x = self.mlp_down(self.mlp_up(x))
        x = x + residual
        return x
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This `batched = True` assignment is a duplicate of the one on line 681 and can be removed.