Merge branch 'InternLM:main' into add_k4v2

zhulinJulia24 · web-flow · commit 2cd0d9fa943e · 2026-05-27T14:24:33.000+08:00
diff --git a/lmdeploy/pytorch/disagg/conn/engine_conn.py b/lmdeploy/pytorch/disagg/conn/engine_conn.py
@@ -80,7 +80,7 @@ async def handle_zmq_recv(self, remote_engine_id: str):
             if isinstance(req, DistServeCacheFreeRequest):
                 session_id = req.remote_session_id
                 if session_id in self.engine.scheduler.sessions:
-                    self.engine.scheduler.end_session(session_id=session_id)
+                    self.engine.end_session(session_id=session_id)
                 else:
                     logger.error(f'invalid free, {remote_engine_id}, {session_id}')
             else:
diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
+import ctypes
 import gc
 import os
 from dataclasses import dataclass
@@ -9,6 +10,7 @@
 import torch
 
 from lmdeploy.messages import PytorchEngineConfig, RequestMetrics, ResponseType, SpeculativeConfig
+from lmdeploy.pytorch import envs as _envs
 from lmdeploy.pytorch.disagg.config import EngineRole
 from lmdeploy.pytorch.disagg.conn.engine_conn import EngineP2PConnection
 from lmdeploy.pytorch.disagg.conn.protocol import (
@@ -188,6 +190,8 @@ def __init__(
         # infer sleeping from empty_init: empty_init still builds runtime
         # resources and has its own weight-update workflow.
         self._sleeping_tags = set()
+        self._multimodal_session_trim_count = max(0, _envs.multimodal_session_trim_count)
+        self._multimodal_session_end_count = 0
 
         # create main thread
         self.req_manager.set_main_loop_func(self.async_loop)
@@ -318,6 +322,37 @@ def _on_stop_session(self, reqs: list[Request], **kwargs):
             if resp:
                 self._response(req.resp, resp_type)
 
+    @staticmethod
+    def _try_mem_trim():
+        """Best-effort trim: run `gc.collect()`, then call `malloc_trim(0)` from `libc.so.6`."""
+        try:
+            gc.collect()  # collect Python garbage before asking libc to trim
+            ctypes.CDLL('libc.so.6').malloc_trim(0)  # load or symbol-lookup errors fall to the except
+        except Exception as e:
+            logger.debug(f'Memory trim failed: {e}')  # logged at debug level and not re-raised
+
+    @staticmethod
+    def _has_multimodal_session(session) -> bool:
+        """Return True if any sequence of `session` has a non-empty `history_multimodals`."""
+        for seq in session.sequences.values():
+            history_multimodals = getattr(seq, 'history_multimodals', None)  # None when the attribute is missing
+            if history_multimodals is not None and not history_multimodals.empty():
+                return True
+        return False
+
+    def _maybe_trim_multimodal_session(self, has_multimodal: bool):
+        """Count ended multimodal sessions and trim host memory every `trim_count`-th one."""
+        trim_count = getattr(self, '_multimodal_session_trim_count', max(0, _envs.multimodal_session_trim_count))
+        if not has_multimodal or trim_count <= 0:  # non-multimodal session, or trimming disabled
+            return
+
+        self._multimodal_session_end_count = getattr(self, '_multimodal_session_end_count', 0) + 1
+        if self._multimodal_session_end_count < trim_count:
+            return
+
+        self._multimodal_session_end_count = 0  # start counting toward the next trim
+        self._try_mem_trim()
+
     def _on_end_session(self, reqs: list[Request], **kwargs):
         """On end session callback."""
         for req in reqs:
@@ -598,7 +633,9 @@ def start_loop(self):
     def end_session(self, session_id: int):
         """End session."""
         if session_id in self.scheduler.sessions:
+            has_multimodal = self._has_multimodal_session(self.scheduler.sessions[session_id])
             self.scheduler.end_session(session_id)
+            self._maybe_trim_multimodal_session(has_multimodal)  # flag was read before the session was ended
             return True
         return False
 
diff --git a/lmdeploy/pytorch/engine/mp_engine/zmq_engine.py b/lmdeploy/pytorch/engine/mp_engine/zmq_engine.py
@@ -97,6 +97,13 @@ def _mp_proc(
 
         from .zmq_rpc import AsyncRPCServer
 
+        # try to rename the process (prctl option 15 is PR_SET_NAME)
+        try:
+            import ctypes
+            ctypes.CDLL(None).prctl(15, b'ZMQMPEngine', 0, 0, 0)
+        except Exception as e:
+            logger.debug(f'Failed to rename MPEngine process: {e}')
+
         logger.setLevel(log_level)
 
         # create an async rpc server
diff --git a/lmdeploy/pytorch/envs.py b/lmdeploy/pytorch/envs.py
@@ -157,6 +157,9 @@ def _patched_get_env(
     # model agent
     skip_warmup = env_to_bool('LMDEPLOY_SKIP_WARMUP', False)
 
+    # memory trim: trim host memory after this many multimodal sessions end (<= 0 disables)
+    multimodal_session_trim_count = env_to_int('LMDEPLOY_MULTIMODAL_SESSION_TRIM_COUNT', 128)
+
     # model format
     scale_fmt = os.getenv('LMDEPLOY_SCALE_FMT', None)
 
diff --git a/lmdeploy/turbomind/models/internvl.py b/lmdeploy/turbomind/models/internvl.py
@@ -15,6 +15,17 @@ def _cfg_get(cfg, name: str, default=None):
     return getattr(cfg, name, default)
 
 
+def map_interns1_hf_keys(name: str) -> str:
+    """Map Intern-S1 HF VLM checkpoint keys to the Qwen3 text loader layout."""
+    language_model_prefix = 'model.language_model.'
+    if name.startswith(language_model_prefix):
+        suffix = name[len(language_model_prefix):]
+        return f'language_model.model.{suffix}'  # model.language_model.X -> language_model.model.X
+    if name.startswith('lm_head.'):
+        return f'language_model.{name}'  # lm_head.X -> language_model.lm_head.X
+    return name  # every other key passes through unchanged
+
+
 @INPUT_MODELS.register_module(name='internvl')
 class InternVLModel:
     """Aggregate source model for InternVL checkpoints with any registered text
@@ -42,6 +53,10 @@ def __init__(self, cfg: PretrainedConfig, *, resolver):
 
         text_model_cls = INPUT_MODELS.get(text_model_registered_name)
         self.text_model = text_model_cls(llm_cfg, resolver=resolver)
+        archs = _cfg_get(cfg, 'architectures') or []
+        self._checkpoint_mappings = []
+        if archs and archs[0] == 'InternS1ForConditionalGeneration':
+            self._checkpoint_mappings.append(map_interns1_hf_keys)
         self.vision_model = None
 
     def bind_runtime(self, *, ctx, root_handles,
@@ -60,7 +75,7 @@ def _vocab_size(self):
 
     @property
     def _loader_mappings(self):
-        return list(getattr(type(self.text_model), '_loader_mappings', []))
+        return self._checkpoint_mappings + list(getattr(type(self.text_model), '_loader_mappings', []))
 
     def model(self, pfx):
         self.text_model.model(pfx + 'language_model')