erictang000
diff --git a/‎skyrl-agent/skyrl_agent/integrations/skyrl_train/skyrl_train_backend.py‎
Lines changed: 10 additions & 2 deletions b/‎skyrl-agent/skyrl_agent/integrations/skyrl_train/skyrl_train_backend.py‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎skyrl/backends/skyrl_train/inference_servers/remote_inference_client.py‎
Lines changed: 116 additions & 39 deletions b/‎skyrl/backends/skyrl_train/inference_servers/remote_inference_client.py‎
Lines changed: 116 additions & 39 deletions
@@ -1,18 +1,26 @@
 from typing import Any, List
+
+from skyrl.backends.skyrl_train.inference_servers.utils import resolve_policy_model_name
+
 from ..base import AsyncInferBackend, GeneratorOutput, GeneratorInput
 
 
 class SkyRLBackend(AsyncInferBackend):
     def __init__(self, infer_engine, tokenizer: Any = None, cfg: Any = None):
         self.client = infer_engine
+        # Resolve the name the inference engine knows the policy by (base
+        # model or registered LoRA adapter) once at construction. Threaded
+        # into every ``client.generate`` call so the data plane never has
+        # to guess the target adapter.
+        self.policy_model_name = resolve_policy_model_name(cfg) if cfg is not None else self.client.model_name
 
     async def async_generate_prompts(self, prompts: Any, sampling_params: Any, **kwargs) -> List[str]:
         input_obj = {
             "prompts": [prompts],
             "session_ids": [kwargs.get("request_id", None)],
             "sampling_params": sampling_params,
         }
-        output = await self.client.generate(input_obj)
+        output = await self.client.generate(input_obj, model=self.policy_model_name)
         return output["responses"][0], output["stop_reasons"][0]
 
     async def async_generate_ids(self, input_ids: List[int], sampling_params: Any, **kwargs) -> List[str]:
@@ -21,7 +29,7 @@ async def async_generate_ids(self, input_ids: List[int], sampling_params: Any, *
             "session_ids": [kwargs.get("request_id", None)],
             "sampling_params": sampling_params,
         }
-        output = await self.client.generate(input_obj)
+        output = await self.client.generate(input_obj, model=self.policy_model_name)
         # todo(@csy) probably need to be finish_reason
         # https://github.com/vllm-project/vllm/blob/a0f8a7964694a6077689b242b5eca95de392d4bb/vllm/v1/engine/__init__.py#L22
         meta_info = {
 
@@ -79,6 +79,9 @@
 
 _DATA_PLANE_RETRIES = 30
 
+SKYRL_LORA_ADAPTER_NAME = "skyrl-lora"
+"""Default LoRA adapter name used for single-LoRA training inside SkyRL."""
+
 _TINKER_SAMPLE_TO_VLLM_PARAM_MAP = {
     "temperature": "temperature",
     "max_tokens": "max_tokens",
@@ -192,14 +195,20 @@ class RemoteInferenceClient:
     reports the full DP world size per server, so we divide by num_deployments."""
 
     model_name: str = "default"
-    """Model name for OpenAI-compatible API calls."""
+    """The base model identifier the inference server was started with.
+
+    Always the base model — never a LoRA adapter name. LoRA adapters are
+    addressed by the names callers register them under via
+    ``load_lora_adapter(name, path)``, and per-call routing is done by
+    passing that name as ``model`` on the data-plane methods.
+
+    Used internally only by ``tokenize``/``detokenize``, which are LoRA-
+    agnostic but still require a ``model`` field per the OpenAI schema.
+    """
 
     enable_return_routed_experts: bool = False
     """Whether to return routed expert indices (R3 / rollout router replay)."""
 
-    active_lora_name: Optional[str] = None
-    """Name of the active LoRA adapter. If set, generation requests use this adapter instead of the base model."""
-
     uses_lora_weight_sync: bool = False
     """True when the trainer syncs LoRA adapters (rather than full/merged weights). When True,
     `sleep()` is forced to level=1: level=2 discards the base model from VRAM with no CPU backup,
@@ -317,9 +326,30 @@ async def _post(self, url: str, json: Dict[str, Any], headers: Optional[Dict[str
     # Data Plane
     # ---------------------------
 
+    def _resolve_model(self, model: Optional[str], method_name: str) -> str:
+        """Pick the target model name for a data-plane call.
+
+        - If ``model`` is non-empty, use it as-is.
+        - Otherwise, when LoRA is in use (``uses_lora_weight_sync=True``) raise
+          ``ValueError`` — the caller must name the adapter explicitly because
+          falling back to the base model would silently bypass LoRA.
+        - Otherwise return ``self.model_name`` (the base model the server was
+          started with).
+        """
+        if model:
+            return model
+        if self.uses_lora_weight_sync:
+            raise ValueError(
+                f"RemoteInferenceClient.{method_name}: `model` is required when LoRA "
+                f"is enabled (uses_lora_weight_sync=True). Pass the LoRA adapter name "
+                f"explicitly so the request doesn't silently target the base model."
+            )
+        return self.model_name
+
     async def generate(
         self,
         input_batch: InferenceEngineInput,
+        model: Optional[str] = None,
     ) -> InferenceEngineOutput:
         """
         Generate completions via /v1/completions.
@@ -335,10 +365,14 @@ async def generate(
 
         Args:
             input_batch: Contains prompt_token_ids, sampling_params, and optional session_ids.
+            model: Optional model identifier — the base model name or a loaded
+                LoRA adapter name. When omitted, defaults to ``self.model_name``
+                if LoRA is not in use; raises ``ValueError`` if it is.
 
         Returns:
             InferenceEngineOutput with responses, response_ids, and stop_reasons.
         """
+        model = self._resolve_model(model, "generate")
 
         prompt_token_ids = input_batch.get("prompt_token_ids")
         if prompt_token_ids is None:
@@ -373,13 +407,15 @@ async def _throttled_generate(idx: int) -> Dict[str, Any]:
                     sampling_params=sampling_params,
                     session_id=session_ids[idx] if session_ids and idx < len(session_ids) else None,
                     mm_features=mm_features[idx] if mm_features and idx < len(mm_features) else None,
+                    model=model,
                 )
             async with gen_sem:
                 return await self._generate_single(
                     prompt_token_ids=prompt_token_ids[idx],
                     sampling_params=sampling_params,
                     session_id=session_ids[idx] if session_ids and idx < len(session_ids) else None,
                     mm_features=mm_features[idx] if mm_features and idx < len(mm_features) else None,
+                    model=model,
                 )
 
         async def _throttled_detokenize(token_ids: List[int]) -> str:
@@ -407,6 +443,7 @@ async def _generate_single(
         prompt_token_ids: List[int],
         sampling_params: Dict[str, Any],
         session_id: Optional[Any],
+        model: str,
         mm_features: Optional[MultiModalFeatures] = None,
     ) -> Dict[str, Any]:
         """
@@ -425,12 +462,9 @@ async def _generate_single(
             else f"{self.proxy_url}/inference/v1/generate"
         )
 
-        # Use LoRA adapter name if one is active, otherwise use base model name
-        effective_model = self.active_lora_name if self.active_lora_name else self.model_name
-
         payload: dict[str, Any] = {
             "sampling_params": sampling_params,
-            "model": effective_model,
+            "model": model,
             "token_ids": prompt_token_ids,
         }
         if mm_features:
@@ -466,6 +500,7 @@ async def _render_for_sample(
         self,
         prompt: Dict[str, Any],
         session_id: Optional[str],
+        model: str,
     ) -> Tuple[List[int], Optional[MultiModalFeatures]]:
         """Build token_ids and optional multi-modal features from a Tinker prompt.
 
@@ -497,10 +532,9 @@ async def _render_for_sample(
                 url = c["location"]
             content_parts.append({"type": "image_url", "image_url": {"url": url}})
 
-        effective_model = self.active_lora_name if self.active_lora_name else self.model_name
         render_payload: Dict[str, Any] = {
             "json": {
-                "model": effective_model,
+                "model": model,
                 "messages": [{"role": "user", "content": content_parts}],
             }
         }
@@ -548,7 +582,10 @@ async def _render_for_sample(
 
         return final_token_ids, adjusted_features
 
-    async def sample(self, request_payload: SampleRequestPayload) -> SampleResponse:
+    async def sample(
+        self,
+        request_payload: SampleRequestPayload,
+    ) -> SampleResponse:
         """
         Sample completions via /inference/v1/generate (Tinker API).
 
@@ -557,13 +594,16 @@ async def sample(self, request_payload: SampleRequestPayload) -> SampleResponse:
 
         Args:
             request_payload: SampleRequestPayload with {"json": <request-body>}.
-                Expected keys in json: prompt, num_samples, sampling_params, session_id,
-                include_prompt_logprobs (bool), topk_prompt_logprobs (int).
+                Expected keys in json: prompt, num_samples, sampling_params,
+                session_id, include_prompt_logprobs (bool), topk_prompt_logprobs (int).
+                ``model`` is optional and resolved via ``_resolve_model``.
 
         Returns:
             SampleResponse with type="sample", sequences list, prompt_logprobs, and topk_prompt_logprobs.
         """
         session_id, body = _extract_session_id_and_body(request_payload)
+        model = self._resolve_model(body.get("model"), "sample")
+        body["model"] = model
 
         prompt = body.get("prompt", {})
         num_samples = body.get("num_samples", 1)
@@ -581,7 +621,7 @@ async def sample(self, request_payload: SampleRequestPayload) -> SampleResponse:
 
         # Render prompt: flatten text tokens and, if images are present,
         # call the render endpoint to get placeholder tokens + features.
-        token_ids, mm_features = await self._render_for_sample(prompt, session_id)
+        token_ids, mm_features = await self._render_for_sample(prompt, session_id, model=model)
 
         # Map Tinker SamplingParams → vLLM format
         sampling_params: Dict[str, Any] = {
@@ -596,11 +636,9 @@ async def sample(self, request_payload: SampleRequestPayload) -> SampleResponse:
             if val is not None:
                 sampling_params[vllm_key] = val
 
-        effective_model = self.active_lora_name if self.active_lora_name else self.model_name
-
         payload: Dict[str, Any] = {
             "sampling_params": sampling_params,
-            "model": effective_model,
+            "model": model,
             "token_ids": token_ids,
         }
         if mm_features is not None:
@@ -679,13 +717,17 @@ async def chat_completion(
 
         Args:
             request_payload: Dict with {"json": <request-body>, "headers": <headers-dict>}.
-                The request body should be OpenAI-compatible chat completion request.
-                session_id can be included in json for consistent routing.
+                The request body must be an OpenAI-compatible chat completion
+                request. ``model`` is optional and resolved via
+                ``_resolve_model``; if omitted the body is mutated to inject the
+                resolved value before forwarding to vLLM. ``session_id`` can be
+                included in the body for consistent routing.
 
         Returns:
             OpenAI-compatible chat completion response.
         """
         session_id, body = _extract_session_id_and_body(request_payload)
+        body["model"] = self._resolve_model(body.get("model"), "chat_completion")
 
         headers = {"Content-Type": "application/json"}
         if session_id:
@@ -708,13 +750,16 @@ async def render_chat_completion(
 
         Args:
             request_payload: Dict with {"json": <request-body>}.
-                The request body should be OpenAI-compatible chat completion request.
-                session_id can be included in json for consistent routing.
+                The request body should be OpenAI-compatible chat completion
+                request. ``model`` is optional and resolved via
+                ``_resolve_model``. session_id can be included in json for
+                consistent routing.
 
         Returns:
             Rendered chat completion response (template-applied prompt and token IDs).
         """
         session_id, body = _extract_session_id_and_body(request_payload)
+        body["model"] = self._resolve_model(body.get("model"), "render_chat_completion")
 
         headers = {"Content-Type": "application/json"}
         if session_id:
@@ -737,13 +782,16 @@ async def completion(
 
         Args:
             request_payload: Dict with {"json": <request-body>, "headers": <headers-dict>}.
-                The request body should be OpenAI-compatible completion request.
-                session_id can be included in json for consistent routing.
+                The request body should be OpenAI-compatible completion
+                request. ``model`` is optional and resolved via
+                ``_resolve_model``. session_id can be included in json for
+                consistent routing.
 
         Returns:
             OpenAI-compatible completion response.
         """
         session_id, body = _extract_session_id_and_body(request_payload)
+        body["model"] = self._resolve_model(body.get("model"), "completion")
 
         headers = {"Content-Type": "application/json"}
         if session_id:
@@ -1010,7 +1058,7 @@ async def update_named_weights(
         """
         Update model weights via vLLM native /update_weights. Used for full parameter fine-tuning.
 
-        For LoRA weight sync, use update_lora_from_disk() instead.
+        For LoRA weight sync, use load_lora_adapter() instead.
 
         Args:
             update_info: Dict with keys expected by vLLM (names, dtype_names, shapes, packed, etc.)
@@ -1095,20 +1143,21 @@ async def finish_weight_update(self) -> Dict[str, Any]:
             {"method": "finish_weight_update"},
         )
 
-    async def update_lora_from_disk(
+    async def load_lora_adapter(
         self,
+        lora_name: str,
         lora_path: str,
     ) -> Dict[str, Any]:
         """
-        Update LoRA adapter weights by loading from disk on all backend servers
-        via the SkyRL custom /skyrl/v1/load_lora_adapter endpoint.
+        Load (or reload) a LoRA adapter on all backend servers via the SkyRL
+        custom /skyrl/v1/load_lora_adapter endpoint.
 
-        Always loads under self.active_lora_name so the same lora_int_id slot
-        is reused across weight syncs.
+        After loading, generation/chat/completion requests can target this LoRA
+        by passing ``model=lora_name``.
 
-        TODO(aaron): switch back to /v1/load_lora_adapter (with "load_inplace": True)
-        once the upstream fix in https://github.com/vllm-project/vllm/pull/41482
-        lands in a vLLM release we depend on.
+        TODO(aaron): switch back to vLLM's /v1/load_lora_adapter once the
+        upstream fix in https://github.com/vllm-project/vllm/pull/41482 lands
+        in a vLLM release we depend on.
 
         The custom endpoint (defined in vllm_server_actor.py) wraps add_lora
         with load_inplace=True (so the engine reloads the freshly-written
@@ -1118,19 +1167,13 @@ async def update_lora_from_disk(
         num_engines>=2 — see vllm_server_actor.py:_skyrl_load_lora_adapter for
         the detailed explanation.
 
-        After loading, generation requests will automatically use the LoRA
-        adapter by setting the model name to the LoRA adapter name.
-
         Args:
+            lora_name: Name to register the adapter under on each server.
             lora_path: Path to the LoRA adapter on disk (must be accessible from servers).
 
         Returns:
             Dict mapping server_url to response.
         """
-        if self.active_lora_name is None:
-            raise ValueError("active_lora_name must be set on RemoteInferenceClient before loading a LoRA adapter.")
-
-        lora_name = self.active_lora_name
         session = await self._get_session()
 
         async def _load_on_server(server_url: str):
@@ -1148,6 +1191,40 @@ async def _load_on_server(server_url: str):
 
         return {url: resp for url, resp in results}
 
+    async def unload_lora_adapter(self, lora_name: str) -> Dict[str, Any]:
+        """
+        Unload a previously-loaded LoRA adapter on all backend servers via /v1/unload_lora_adapter.
+
+        After unloading, ``lora_name`` is no longer accepted as a ``model``
+        target on any server. The underlying CPU/GPU LRU entries on vLLM age
+        out naturally as new adapters are loaded.
+
+        Args:
+            lora_name: Name of the adapter to unload.
+
+        Returns:
+            Dict mapping server_url to response.
+        """
+        payload = {"lora_name": lora_name}
+
+        # Mirror load_lora_adapter: vLLM returns plain text on success and JSON
+        # ErrorResponse (e.g. 404) on failure.
+        session = await self._get_session()
+
+        async def _unload_on_server(server_url: str):
+            url = f"{server_url}/v1/unload_lora_adapter"
+            async with session.post(url, json=payload) as resp:
+                if resp.status >= 400:
+                    body = await resp.json()
+                    raise_for_status(resp, body)
+                return server_url, {"status": resp.status, "body": await resp.text()}
+
+        results = await asyncio.gather(*[_unload_on_server(url) for url in self.server_urls])
+
+        logger.info(f"Unloaded LoRA adapter '{lora_name}'")
+
+        return {url: resp for url, resp in results}
+
     # ---------------------------
     # Info
     # ---------------------------