sgl-project
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 2 additions & 0 deletions
diff --git a/sglang_omni/engines/omni/factory.py b/sglang_omni/engines/omni/factory.py
Lines changed: 5 additions & 1 deletion
diff --git a/sglang_omni/engines/omni/runtime/sglang_ar.py b/sglang_omni/engines/omni/runtime/sglang_ar.py
Lines changed: 37 additions & 5 deletions
diff --git a/sglang_omni/engines/omni/runtime/thinker_forward.py b/sglang_omni/engines/omni/runtime/thinker_forward.py
Lines changed: 3 additions & 0 deletions
diff --git a/sglang_omni/models/ming_omni/components/image_gen_executor.py b/sglang_omni/models/ming_omni/components/image_gen_executor.py
Lines changed: 13 additions & 7 deletions
diff --git a/sglang_omni/models/ming_omni/components/preprocessor.py b/sglang_omni/models/ming_omni/components/preprocessor.py
Lines changed: 14 additions & 20 deletions
diff --git a/sglang_omni/models/ming_omni/config.py b/sglang_omni/models/ming_omni/config.py
Lines changed: 1 addition & 1 deletion
diff --git a/sglang_omni/models/ming_omni/diffusion/backend.py b/sglang_omni/models/ming_omni/diffusion/backend.py
Lines changed: 0 additions & 2 deletions
diff --git a/sglang_omni/models/ming_omni/diffusion/bailing_moe_config.py b/sglang_omni/models/ming_omni/diffusion/bailing_moe_config.py
Lines changed: 5 additions & 1 deletion
@@ -56,6 +56,8 @@ dependencies = [
     "torchaudio",
     # Gradio playground
     "gradio>=4.0.0",
+    # Ming-Omni
+    "diffusers==0.37.1",
     # flash-attn: install separately via prebuilt wheel (see install instructions below)
 ]
 
 
@@ -330,7 +330,11 @@ def create_sglang_ar_engine(
     output_proc = SGLangOutputProcessor(
         capture_hidden=capture_hidden,
         capture_hidden_layers=capture_hidden_layers,
-        model=model_worker.model_runner.model if capture_hidden_layers else None,
+        model=(
+            model_worker.model_runner.model
+            if (capture_hidden or capture_hidden_layers)
+            else None
+        ),
     )
 
     if stream_adapter is None:
 
@@ -355,7 +355,9 @@ def process(
             token_id = token_list[i] if i < len(token_list) else None
             extra = None
             if hidden_states_dict is not None:
-                if "_single" in hidden_states_dict:
+                if "_full" in hidden_states_dict:
+                    extra = {"hidden_states": hidden_states_dict["_full"]}
+                elif "_single" in hidden_states_dict:
                     extra = {"hidden_states": hidden_states_dict["_single"][i]}
                 else:
                     per_req = {}
@@ -382,10 +384,20 @@ def _extract_hidden_states(
         """Extract hidden states from model output or side-channel.
 
         Priority:
-        1. Side-channel (_captured_aux_hidden_states) from hidden capture hooks
-        2. logits_output.hidden_states (legacy single-tensor path)
+        1. Full-sequence side-channel (_captured_full_hidden_states) for
+           image_gen prefill-only — preserves the full [seq_len, hidden_dim]
+           tensor so downstream can apply gen_mask.
+        2. Side-channel (_captured_aux_hidden_states) from hidden capture hooks
+        3. logits_output.hidden_states (legacy single-tensor path)
         """
-        # Check side-channel first (set by _hidden_capture hooks)
+        # Full-sequence capture (set by BailingMoeV2ForCausalLM.forward)
+        if self._model is not None:
+            full_hs = getattr(self._model, "_captured_full_hidden_states", None)
+            if full_hs is not None:
+                self._model._captured_full_hidden_states = None
+                return {"_full": full_hs}
+
+        # Side-channel from _hidden_capture hooks
         if self._model is not None and self._capture_hidden_layers:
             aux = getattr(self._model, "_captured_aux_hidden_states", None)
             if aux is not None:
@@ -471,14 +483,18 @@ def update_request(self, request: SchedulerRequest, output: RequestOutput) -> No
             )
 
         if req.is_chunked > 0:
+            # Accumulate full-sequence hidden states across chunks so
+            # image_gen prefill-only gets the complete [seq_len, hidden_dim].
+            if output.extra:
+                self._accumulate_hidden_states(data, output.extra)
             output.data = None
             req.is_chunked -= 1
             return
 
         # Transfer captured model outputs (e.g. hidden states) to the
         # request data so they're available to downstream pipeline stages.
         if output.extra:
-            data.extra_model_outputs.update(output.extra)
+            self._accumulate_hidden_states(data, output.extra)
 
         token_id = output.data
         if token_id is not None:
@@ -499,6 +515,22 @@ def update_request(self, request: SchedulerRequest, output: RequestOutput) -> No
                 req.finished(),
             )
 
+    @staticmethod
+    def _accumulate_hidden_states(data, extra: dict) -> None:
+        """Merge ``extra`` into ``data.extra_model_outputs``, accumulating hidden states.
+
+        A tensor-valued ``extra["hidden_states"]`` is appended along dim 0 to
+        any tensor already stored under that key, so repeated calls build up
+        one tensor. Every other key, and any non-tensor ``hidden_states``
+        value, is copied with plain ``dict.update`` semantics (last write wins).
+
+        Args:
+            data: Object exposing a mutable ``extra_model_outputs`` dict.
+            extra: Outputs captured for the current step; not modified.
+        """
+        hs = extra.get("hidden_states")
+        if not isinstance(hs, torch.Tensor):
+            # Missing or non-tensor hidden states: nothing to concatenate.
+            data.extra_model_outputs.update(extra)
+            return
+
+        prev = data.extra_model_outputs.get("hidden_states")
+        if isinstance(prev, torch.Tensor):
+            # Promote 1-D vectors to single rows before concatenating;
+            # cat(dim=0) on raw 1-D tensors would splice them into one long
+            # flat vector and lose the per-step boundary.
+            hs = torch.cat([torch.atleast_2d(prev), torch.atleast_2d(hs)], dim=0)
+
+        # Copy the remaining keys unchanged; hidden_states carries the merged tensor.
+        data.extra_model_outputs.update({**extra, "hidden_states": hs})
+
     def is_finished(self, request: SchedulerRequest, output: RequestOutput) -> bool:
         return request.data.req.finished()
 
 
@@ -50,6 +50,9 @@ def thinker_forward_omni(
         input_deepstack_embeds=ds_input,
     )
 
+    if getattr(forward_batch, "capture_hidden_mode", None) is not None:
+        outer_model._captured_full_hidden_states = hidden_states.clone()
+
     return outer_model.logits_processor(
         forward_batch.input_ids,
         hidden_states,
 
@@ -22,7 +22,10 @@
 import torch
 
 from sglang_omni.executors.interface import Executor
-from sglang_omni.models.ming_omni.diffusion.backend import DiffusionBackend, ImageGenParams
+from sglang_omni.models.ming_omni.diffusion.backend import (
+    DiffusionBackend,
+    ImageGenParams,
+)
 from sglang_omni.proto import StagePayload
 
 logger = logging.getLogger(__name__)
@@ -91,14 +94,14 @@ def _load_models(self) -> None:
                 skip_semantic_encoder=True,
             )
         else:
-            self._backend.load_models(
-                self._dit_model_path, torch.device(self._device)
-            )
+            self._backend.load_models(self._dit_model_path, torch.device(self._device))
         logger.info("[IMG_GEN] Diffusion backend loaded in %.1fs", time.time() - t0)
 
         # Load thinker tokenizer for decoding output_ids → text prompt
         try:
-            from sglang_omni.models.ming_omni.components.common import load_ming_tokenizer
+            from sglang_omni.models.ming_omni.components.common import (
+                load_ming_tokenizer,
+            )
 
             self._thinker_tokenizer = load_ming_tokenizer(self._model_path)
             logger.info(
@@ -276,7 +279,9 @@ def _extract_input(self, payload: StagePayload) -> tuple[str, ImageGenParams]:
         if isinstance(thinker_out, dict):
             output_ids = thinker_out.get("output_ids", [])
             if output_ids and self._thinker_tokenizer is not None:
-                text = self._thinker_tokenizer.decode(output_ids, skip_special_tokens=True)
+                text = self._thinker_tokenizer.decode(
+                    output_ids, skip_special_tokens=True
+                )
 
         # Fallback: pre-decoded text
         if not text:
@@ -354,7 +359,8 @@ def _try_condition_from_hidden_states(
         if isinstance(hidden_states, dict):
             # Side-channel capture: pick the last (highest) layer
             numeric_keys = [
-                k for k in hidden_states
+                k
+                for k in hidden_states
                 if isinstance(k, int) or (isinstance(k, str) and k.isdigit())
             ]
             if not numeric_keys:
 
@@ -164,6 +164,17 @@ def __init__(self, model_path: str, conditioner=None):
         # Lazy-init image processor
         self._image_processor = None
 
+        # Image generation conditioner (optional)
+        self._conditioner = conditioner
+        if conditioner is not None:
+            self._image_patch_token_id = conditioner.image_patch_token
+            self._image_start_token_id = conditioner.image_start_token
+            self._image_end_token_id = conditioner.image_end_token
+        else:
+            self._image_patch_token_id = None
+            self._image_start_token_id = None
+            self._image_end_token_id = None
+
     def _get_image_processor(self):
         """Lazy-init Qwen2VLImageProcessor (same processor as Ming-Omni uses)."""
         if self._image_processor is None:
@@ -199,17 +210,6 @@ def _process_images(
         )
         return pixel_values, image_grid_thw, token_counts
 
-        # Image generation conditioner (optional)
-        self._conditioner = conditioner
-        if conditioner is not None:
-            self._image_patch_token_id = conditioner.image_patch_token
-            self._image_start_token_id = conditioner.image_start_token
-            self._image_end_token_id = conditioner.image_end_token
-        else:
-            self._image_patch_token_id = None
-            self._image_start_token_id = None
-            self._image_end_token_id = None
-
     async def __call__(self, payload: StagePayload) -> StagePayload:
         """Process a chat completion request into pipeline state."""
         request = payload.request
@@ -340,26 +340,20 @@ async def __call__(self, payload: StagePayload) -> StagePayload:
         gen_mask = None
 
         if is_image_gen:
-            num_query_tokens = sum(
-                s * s for s in self._conditioner.img_gen_scales
-            )
+            num_query_tokens = sum(s * s for s in self._conditioner.img_gen_scales)
 
             suffix_ids = (
                 [self._image_start_token_id]
                 + [self._image_patch_token_id] * num_query_tokens
                 + [self._image_end_token_id]
             )
             suffix_tensor = torch.tensor([suffix_ids], dtype=torch.long)
-            input_ids_tensor = torch.cat(
-                [input_ids_tensor, suffix_tensor], dim=1
-            )
+            input_ids_tensor = torch.cat([input_ids_tensor, suffix_tensor], dim=1)
             attention_mask = torch.ones_like(input_ids_tensor)
 
             # gen_mask: 0 for text, 1 for query tokens, 0 for start/end markers
             text_len = len(input_ids)
-            gen_mask = torch.zeros(
-                input_ids_tensor.shape[1], dtype=torch.long
-            )
+            gen_mask = torch.zeros(input_ids_tensor.shape[1], dtype=torch.long)
             gen_mask[text_len + 1 : text_len + 1 + num_query_tokens] = 1
 
         prompt: PromptInputs = {
 
@@ -16,8 +16,8 @@
     AGGREGATE_STAGE,
     AUDIO_STAGE,
     DECODE_STAGE,
-    IMAGE_STAGE,
     IMAGE_GEN_STAGE,
+    IMAGE_STAGE,
     PREPROCESSING_STAGE,
     TALKER_STAGE,
     THINKER_STAGE,
 
@@ -5,7 +5,6 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Any
 
 import torch
 from PIL import Image
@@ -51,4 +50,3 @@ def generate(
 
     def unload(self) -> None:
         """Release GPU memory."""
-        pass
@@ -3,8 +3,10 @@
 """Bailing MoE model configuration"""
 from transformers.configuration_utils import PretrainedConfig
 
+
 class BailingMoeV2Config(PretrainedConfig):
     model_type = "bailing_moe_v2"
+
     def __init__(
         self,
         vocab_size=30592,
@@ -84,5 +86,7 @@ def __init__(
         self.partial_rotary_factor = partial_rotary_factor
         self.router_type = router_type
         self.use_interleaved_frame_timestamp = use_interleaved_frame_timestamp
-        super().__init__(pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs)
+        super().__init__(
+            pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+        )
         self._attn_implementation = _attn_implementation
Original file line number	Diff line number	Diff line change
`@@ -56,6 +56,8 @@ dependencies = [`
`56`	`56`	`    "torchaudio",`
`57`	`57`	`    # Gradio playground`
`58`	`58`	`    "gradio>=4.0.0",`
	`59`	`+    # Ming-Omni`
	`60`	`+    "diffusers==0.37.1",`
`59`	`61`	`    # flash-attn: install separately via prebuilt wheel (see install instructions below)`
`60`	`62`	`]`
`61`	`63`
Original file line number	Diff line number	Diff line change
`@@ -50,6 +50,9 @@ def thinker_forward_omni(`
`50`	`50`	`        input_deepstack_embeds=ds_input,`
`51`	`51`	`    )`
`52`	`52`
	`53`	`+    if getattr(forward_batch, "capture_hidden_mode", None) is not None:`
	`54`	`+        outer_model._captured_full_hidden_states = hidden_states.clone()`
	`55`	`+`
`53`	`56`	`    return outer_model.logits_processor(`
`54`	`57`	`        forward_batch.input_ids,`
`55`	`58`	`        hidden_states,`