[multi-lora] Restore v1 sampling guards + add SEQ-vs-ALT min repro test

erictang000 · erictang000 · commit 43f7d6578713 · 2026-05-07T22:34:52.000Z
Two fixes: 1. Restore the v1 single-tenant sampling guards in skyrl_train_backend.py that the merge from origin/main accidentally dropped: - sample() returns an ErrorResponse when LoRA is active and more than one adapter is registered. - save_sampler_checkpoint raises ValueError under the same condition. Multi-tenant inference is the RL follow-up (NovaSky-AI#1621); SFT v1 must refuse it explicitly rather than silently corrupt state. test_sample_with_two_adapters_errors had been passing in earlier runs only by accident — this restores the actual guarantee. 2. Add test_seq_vs_alt_per_adapter_step_isolation: a minimal repro of the SEQ-vs-ALT divergence flagged in ~/skyrl-seq-vs-alt-repro (against Qwen3-4B + PPO on a real pod). It creates two fresh adapters, runs an ALT-style sequence on identical data, and asserts that the pre-update losses match within 1e-3 at step 0 and within 1e-2 at step 1. With AdapterStore snapshotting state['step'] per slot, this passes on the tiny model — step 0 is bit-exact, step 1 diverges by 1.7e-4 (three orders of magnitude below the user's Qwen3-4B observation). If a future change leaks a global step counter across adapters, this test will fail loudly; the assertion message points at the SEQ-vs-ALT diagnosis. Local: 5/5 pass in ~2m on 1x B200.
diff --git a/skyrl/backends/skyrl_train_backend.py b/skyrl/backends/skyrl_train_backend.py
@@ -249,11 +249,12 @@ def _build_policy(self, PolicyWorker, model_id: str):
         )
         ray.get(policy_model.async_run_ray_method("pass_through", "_set_pad_token_id", self._tokenizer.pad_token_id))
 
-        # Multi-LoRA bootstrap: prime DistributedOptimizer state and snapshot
-        # the freshly-initialised LoRA into a per-worker pristine slot, then
-        # register the first adapter under `model_id`. Must happen while the
-        # model + optimizer are still GPU-resident (i.e. before the offload).
+
         if is_lora:
+            # For multi-tenant LoRA training: prime DistributedOptimizer state and snapshot
+            # the freshly-initialised LoRA into a per-worker pristine slot, then
+            # register the first adapter under `model_id`. Must happen while the
+            # model + optimizer are still GPU-resident (i.e. before the offload).
             ray.get(policy_model.async_run_ray_method("pass_through", "prime_optimizer_state"))
             ray.get(policy_model.async_run_ray_method("pass_through", "register_pristine_adapter"))
             ray.get(policy_model.async_run_ray_method("pass_through", "register_adapter", model_id))
@@ -354,10 +355,8 @@ def _ensure_inference_engines(self):
 
     def _lora_signature_from(self, lora_config: types.LoraConfig) -> tuple:
         # Tinker's public LoraConfig only exposes rank + alpha (plus
-        # seed/train_attn/train_mlp/train_unembed, which the SkyRL Megatron
-        # path doesn't honor — target_modules is fixed server-side via
-        # cfg.trainer.policy.model.lora.target_modules). Equality across
-        # adapters therefore reduces to (rank, alpha); the worker-side
+        # seed/train_attn/train_mlp/train_unembed, not yet supported: https://github.com/NovaSky-AI/SkyRL/issues/1632).
+        # Equality across adapters therefore reduces to (rank, alpha); the worker-side
         # AdapterStore additionally verifies parallel-state equality via
         # its own LoraSignature.
         return (int(lora_config.rank), int(lora_config.alpha))
@@ -390,8 +389,8 @@ def create_model(self, model_id: str, lora_config: types.LoraConfig, model_role:
                     f"LoRA signature mismatch for model '{model_id}': "
                     f"got (rank, alpha)={new_signature}, "
                     f"first adapter registered with {self._base_lora_signature}. "
-                    "Multi-LoRA requires identical (rank, alpha) across all "
-                    "adapters in v1; target_modules is fixed server-side."
+                    "Multi-LoRA with the SkyRLTrainBackend requires identical (rank, alpha) across all "
+                    "adapters; target_modules is fixed server-side."
                 )
             self._dispatch.register_adapter("policy", model_id)
             self._model_ids_to_role[model_id] = model_role
@@ -877,7 +876,9 @@ def sample(
         self._ensure_inference_engines()
 
         # v1 multi-LoRA: sample() is single-tenant. The inference engine path
-        # is not yet adapter-aware, so refuse if more than one adapter exists.
+        # is not yet adapter-aware on this branch, so refuse if more than one
+        # LoRA adapter is registered. Multi-tenant sampling lands in the RL
+        # follow-up.
         if self._base_lora_signature is not None and len(self._model_ids_to_role) > 1:
             error = types.ErrorResponse(
                 error=(
diff --git a/tests/tinker/test_multi_lora_megatron.py b/tests/tinker/test_multi_lora_megatron.py
@@ -15,6 +15,10 @@
   6. create_model("C", rank=different) → expect a structured ValueError.
   7. sample() with two adapters → expect a structured error.
   8. delete_model("A"), then forward_backward on B → still works.
+
+
+Run with
+uv run --extra tinker --extra megatron --with pytest --with pytest-timeout python -m pytest -s tests/tinker/test_multi_lora_megatron.py
 """
 
 from __future__ import annotations
@@ -47,8 +51,8 @@
 TINKER_API_KEY = "tml-dummy"
 TEST_PORT = 8011
 
-# Tiny config: 1 GPU, no TP/PP, single DP rank. Adjust as needed for your
-# CI hardware. With a tiny model + LoRA rank 8, this fits comfortably in
+# Tiny config: 1 GPU, no TP/PP, single DP rank.
+# With a tiny model + LoRA rank 8, this fits comfortably in
 # any modern GPU.
 BACKEND_CONFIG = {
     "strategy": "megatron",
@@ -200,6 +204,66 @@ def test_sample_with_two_adapters_errors(service_client):
         a.save_weights_and_get_sampling_client(name="should_fail")
 
 
+def test_seq_vs_alt_per_adapter_step_isolation(service_client):
+    """Min repro of the SEQ-vs-ALT divergence flagged in
+    ~/skyrl-seq-vs-alt-repro (against Qwen3-4B on a real pod).
+
+    Two fresh adapters, identical pristine state, identical data. We do an
+    ALT-style sequence (A.step0, B.step0, A.step1, B.step1) and assert that
+    A's pre-update loss == B's pre-update loss at every step (within FP
+    tolerance). Both adapters were pristine when their first step ran, and
+    both received the same parameters after their respective updates, so
+    their losses must match — unless a step counter, scheduler position, or
+    other Adam-bias-correction state leaks across adapters via shared
+    optimizer state.
+
+    The Qwen3-4B repro shows a 0.09-0.45 nat divergence; we use tighter bounds
+    here (1e-3 at step 0, 1e-2 at step 1) because the tiny model's losses are
+    smaller and the AdapterStore snapshot/restore should keep state['step'] per-adapter.
+    """
+    client_a = service_client.create_lora_training_client(base_model=BASE_MODEL, rank=8)
+    client_b = service_client.create_lora_training_client(base_model=BASE_MODEL, rank=8)
+    tok = client_a.get_tokenizer()
+    data = [_make_datum(tok, "Question: 1+1?\nAnswer:", " 2")]
+
+    def fb_step(c):
+        out = c.forward_backward(data, "cross_entropy").result()
+        loss = sum(sum(o["elementwise_loss"].data) for o in out.loss_fn_outputs)
+        c.optim_step(tinker_types.AdamParams(learning_rate=1e-3)).result()
+        return loss
+
+    # ALT pattern: A.step0, B.step0, A.step1, B.step1
+    a0 = fb_step(client_a)
+    b0 = fb_step(client_b)
+    a1 = fb_step(client_a)
+    b1 = fb_step(client_b)
+    print(
+        f"\n[seq_vs_alt] step 0: A={a0!r} B={b0!r} |Δ|={abs(a0 - b0):.6e}\n"
+        f"[seq_vs_alt] step 1: A={a1!r} B={b1!r} |Δ|={abs(a1 - b1):.6e}"
+    )
+
+    # Step 0: both adapters were pristine + saw identical data.
+    assert abs(a0 - b0) < 1e-3, f"step 0 loss differs: A={a0!r} B={b0!r} (Δ={abs(a0 - b0):.6f})"
+
+    # Step 1: both adapters had exactly one optim_step from pristine on
+    # identical data. If the per-adapter step counter is correctly
+    # snapshotted/restored by AdapterStore, both updates use Adam at t=2
+    # (after the one-step priming), so their post-update states match and
+    # their step-1 losses match.
+    #
+    # If a global step counter advanced (one for A's step 0, one for B's
+    # step 0), B's first real update saw t=3 vs A's t=2, producing a
+    # measurably different update.
+    delta = abs(a1 - b1)
+    assert delta < 1e-2, (
+        f"step 1 loss diverges between adapters: A={a1!r} B={b1!r} (|Δ|={delta:.4f}). "
+        f"Symmetric prediction of a shared global step counter "
+        f"(LR scheduler position or Adam bias-correction step) advancing on every "
+        f"optim_step instead of being held per-adapter — see "
+        f"~/skyrl-seq-vs-alt-repro/README.md."
+    )
+
+
 def test_delete_then_train_remaining(service_client):
     a = service_client.create_lora_training_client(base_model=BASE_MODEL, rank=8)
     b = service_client.create_lora_training_client(base_model=BASE_MODEL, rank=8)