Minor improvements to make RL workflow more robust (#319)

ultmaster · web-flow · commit b3cb5e1337ce · 2025-11-18T15:40:51.000+08:00
diff --git a/.github/workflows/examples-calc-x.yml b/.github/workflows/examples-calc-x.yml
@@ -239,6 +239,14 @@ jobs:
           WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
         id: calc_x_train_local_model
 
+      - name: Validate training with local model
+        run: |
+          set -ex
+          uv run scripts/validate_example_wandb.py ${{ steps.calc_x_train_local_model.outputs.project_name }} ${{ steps.calc_x_train_local_model.outputs.run_name }}
+        env:
+          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
+          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
+
       - name: Training with LLM Proxy
         run: |
           set -ex
@@ -254,6 +262,14 @@ jobs:
           WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
         id: calc_x_train_llm_proxy
 
+      - name: Validate training with LLM Proxy
+        run: |
+          set -ex
+          uv run scripts/validate_example_wandb.py ${{ steps.calc_x_train_llm_proxy.outputs.project_name }} ${{ steps.calc_x_train_llm_proxy.outputs.run_name }}
+        env:
+          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
+          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
+
       - name: Training with external store
         run: |
           set -euo pipefail
@@ -284,6 +300,14 @@ jobs:
           WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
         id: calc_x_train_external_store
 
+      - name: Validate training with external store
+        run: |
+          set -ex
+          uv run scripts/validate_example_wandb.py ${{ steps.calc_x_train_external_store.outputs.project_name }} ${{ steps.calc_x_train_external_store.outputs.run_name }}
+        env:
+          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
+          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
+
       - name: Training with role-based environment variables
         run: |
           set -euo pipefail
@@ -305,3 +329,12 @@ jobs:
         env:
           WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
           WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
+        id: calc_x_train_role_based_env_var
+
+      - name: Validate training with role-based environment variables
+        run: |
+          set -ex
+          uv run scripts/validate_example_wandb.py ${{ steps.calc_x_train_role_based_env_var.outputs.project_name }} ${{ steps.calc_x_train_role_based_env_var.outputs.run_name }}
+        env:
+          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
+          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
diff --git a/.github/workflows/examples-spider.yml b/.github/workflows/examples-spider.yml
@@ -121,7 +121,7 @@ jobs:
       - name: Validate Spider training
         run: |
           set -ex
-          uv run scripts/validate_example_wandb.py ${{ steps.spider_train.outputs.project_name }} ${{ steps.spider_train.outputs.run_name }}
+          uv run scripts/validate_example_wandb.py ${{ steps.spider_train.outputs.project_name }} ${{ steps.spider_train.outputs.run_name }} --reward-tolerance 5
         env:
           WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
           WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
diff --git a/agentlightning/execution/client_server.py b/agentlightning/execution/client_server.py
@@ -71,6 +71,7 @@ def __init__(
         terminate_timeout: float = 10.0,
         main_process: Literal["algorithm", "runner"] = "algorithm",
         managed_store: bool | None = None,
+        allowed_exit_codes: Iterable[int] = (0, -15),
     ) -> None:
         """Configure the strategy.
 
@@ -94,6 +95,9 @@ def __init__(
                 LightningStore client/server wrappers automatically. When
                 `False` the provided `store` is passed directly to the
                 bundles, allowing callers to manage store wrappers manually.
+            allowed_exit_codes: Exit codes that are not treated as failures for subprocesses.
+                By default, a subprocess may exit gracefully with code 0 or be
+                terminated by SIGTERM (-15).
         """
         if role is None:
             role_env = os.getenv("AGL_CURRENT_ROLE")
@@ -133,6 +137,7 @@ def __init__(
                 raise ValueError("main_process='runner' requires n_runners to be 1")
         self.main_process = main_process
         self.managed_store = resolve_managed_store_flag(managed_store)
+        self.allowed_exit_codes = tuple(allowed_exit_codes)
 
     async def _execute_algorithm(
         self, algorithm: AlgorithmBundle, store: LightningStore, stop_evt: ExecutionEvent
@@ -338,10 +343,10 @@ def _shutdown_processes(
 
     def _check_process_exitcodes(self, processes: Iterable[multiprocessing.Process]) -> None:
         """Raise an error if any managed process exited with a non-zero status."""
-        failed = [p for p in processes if p.exitcode not in (0, None)]
+        failed = [p for p in processes if p.exitcode not in self.allowed_exit_codes + (None,)]
         if failed:
             formatted = ", ".join(f"{p.name or p.pid} (exitcode={p.exitcode})" for p in failed)
-            raise RuntimeError(f"Subprocesses failed: {formatted}")
+            raise RuntimeError(f"Subprocesses failed with unexpected exit codes: {formatted}")
 
     def execute(self, algorithm: AlgorithmBundle, runner: RunnerBundle, store: LightningStore) -> None:
         logger.info(
diff --git a/agentlightning/runner/agent.py b/agentlightning/runner/agent.py
@@ -11,6 +11,7 @@
 
 import asyncio
 import logging
+import random
 import threading
 import time
 from contextlib import suppress
@@ -72,6 +73,7 @@ def __init__(
         max_rollouts: Optional[int] = None,
         poll_interval: float = 5.0,
         heartbeat_interval: float = 10.0,
+        interval_jitter: float = 0.1,
         heartbeat_launch_mode: Literal["asyncio", "thread"] = "asyncio",
     ) -> None:
         """Initialize the agent runner.
@@ -82,6 +84,9 @@ def __init__(
                 [`iter`][agentlightning.LitAgentRunner.iter].
             poll_interval: Seconds to wait between store polls when no work is available.
             heartbeat_interval: Seconds to wait between sending heartbeats to the store.
+            interval_jitter: Maximum jitter, in seconds, applied to both the poll and heartbeat intervals.
+                Each wait is drawn uniformly from [interval - interval_jitter, interval + interval_jitter]
+                and is never shorter than 0.01 seconds. This avoids the overload caused by synchronized runners.
             heartbeat_launch_mode: Launch mode for the heartbeat loop. Can be "asyncio" or "thread".
                 "asyncio" is the default and recommended mode. Use "thread" if you are experiencing blocking coroutines.
         """
@@ -90,7 +95,9 @@ def __init__(
         self._max_rollouts = max_rollouts
         self._poll_interval = poll_interval
         self._heartbeat_interval = heartbeat_interval
+        self._interval_jitter = interval_jitter
         self._heartbeat_launch_mode = heartbeat_launch_mode
+        self._random_state = random.Random()
 
         # Set later
         self._agent: Optional[LitAgent[T_task]] = None
@@ -360,7 +367,11 @@ async def heartbeat_loop() -> None:
                 while not stop_event.is_set():
                     await self._emit_heartbeat(store)
                     with suppress(asyncio.TimeoutError):
-                        await asyncio.wait_for(stop_event.wait(), timeout=self._heartbeat_interval)
+                        interval = self._heartbeat_interval + self._random_state.uniform(
+                            -self._interval_jitter, self._interval_jitter
+                        )
+                        interval = max(interval, 0.01)
+                        await asyncio.wait_for(stop_event.wait(), timeout=interval)
 
             task = asyncio.create_task(heartbeat_loop(), name=f"{self.get_worker_id()}-heartbeat")
 
@@ -379,7 +390,11 @@ def thread_worker() -> None:
                 asyncio.set_event_loop(loop)
                 while not stop_evt.is_set():
                     loop.run_until_complete(self._emit_heartbeat(store))
-                    stop_evt.wait(self._heartbeat_interval)
+                    interval = self._heartbeat_interval + self._random_state.uniform(
+                        -self._interval_jitter, self._interval_jitter
+                    )
+                    interval = max(interval, 0.01)
+                    stop_evt.wait(interval)
 
             thread = threading.Thread(target=thread_worker, name=f"{self.get_worker_id()}-heartbeat", daemon=True)
             thread.start()
@@ -402,11 +417,13 @@ async def _sleep_until_next_poll(self, event: Optional[ExecutionEvent] = None) -
             event: Optional [`ExecutionEvent`][agentlightning.ExecutionEvent] object that can be used to interrupt the sleep.
                 If set during the sleep period, the method returns immediately.
         """
+        interval = self._poll_interval + self._random_state.uniform(-self._interval_jitter, self._interval_jitter)
+        interval = max(interval, 0.01)
         if event is None:
-            await asyncio.sleep(self._poll_interval)
+            await asyncio.sleep(interval)
             return
         current_time = time.time()
-        next_time = current_time + self._poll_interval
+        next_time = current_time + interval
         while time.time() < next_time:
             await asyncio.sleep(0.1)
             if event.is_set():
diff --git a/agentlightning/store/client_server.py b/agentlightning/store/client_server.py
@@ -1131,6 +1131,7 @@ def __getstate__(self):
         are excluded as they should not be transferred between processes.
         """
         return {
+            "server_address_root": self.server_address_root,
             "server_address": self.server_address,
             "_retry_delays": self._retry_delays,
             "_health_retry_delays": self._health_retry_delays,
@@ -1145,6 +1146,7 @@ def __setstate__(self, state: Dict[str, Any]):
         Replicating `__init__` logic to create another client instance in the subprocess.
         """
         self.server_address = state["server_address"]
+        self.server_address_root = state["server_address_root"]
         self._sessions = {}
         self._lock = threading.Lock()
         self._retry_delays = state["_retry_delays"]
diff --git a/agentlightning/utils/server_launcher.py b/agentlightning/utils/server_launcher.py
@@ -15,7 +15,7 @@
 from contextlib import asynccontextmanager, suppress
 from dataclasses import dataclass
 from multiprocessing.process import BaseProcess
-from typing import Any, AsyncContextManager, AsyncIterator, Dict, Literal, Optional
+from typing import Any, AsyncContextManager, AsyncIterator, Dict, Literal, Optional, cast
 
 import aiohttp
 import requests
@@ -65,6 +65,8 @@ class PythonServerLauncherArgs:
     """The timeout to wait for the thread to join."""
     process_join_timeout: float = 10.0
     """The timeout to wait for the process to join."""
+    timeout_keep_alive: int = 30
+    """Seconds to keep an idle keep-alive connection open (passed to the uvicorn server as `timeout_keep_alive`)."""
 
 
 @dataclass
@@ -650,7 +652,7 @@ def __getstate__(self):
 
     def __setstate__(self, state: Dict[str, Any]):
         self.app = state["app"]
-        self.args = state["args"]
+        self.args = cast(PythonServerLauncherArgs, state["args"])
         self.serve_context = state["serve_context"]
         self._host = state["_host"]
         self._port = state["_port"]
@@ -796,6 +798,7 @@ def _create_uvicorn_server(self) -> uvicorn.Server:
             log_level=self.args.log_level,
             access_log=self.args.access_log,
             loop="asyncio",
+            timeout_keep_alive=self.args.timeout_keep_alive,
         )
         return uvicorn.Server(config)
 
diff --git a/agentlightning/verl/daemon.py b/agentlightning/verl/daemon.py
@@ -18,14 +18,12 @@
 from tensordict import TensorDict
 from verl import DataProto
 
-from agentlightning import LLM, AgentLightningServer, NamedResources, RolloutLegacy, setup_logging
+from agentlightning import LLM, AgentLightningServer, NamedResources, RolloutLegacy
 from agentlightning.adapter.triplet import TracerTraceToTriplet, TraceToTripletBase
 from agentlightning.llm_proxy import LLMProxy, ModelConfig
 from agentlightning.store.base import LightningStore
 from agentlightning.types import Rollout, RolloutConfig, Task
 
-setup_logging()
-
 __all__ = [
     "AgentModeDaemon",
     "get_left_padded_ids_and_attention_mask",
diff --git a/examples/calc_x/calc_agent.py b/examples/calc_x/calc_agent.py
@@ -84,14 +84,19 @@ async def calc_agent(task: MathProblem, llm: agl.LLM) -> None:
         try:
             output_format = "Output the answer when you are ready. The answer should be surrounded by three sharps (`###`), in the form of ### ANSWER: <answer> ###."
             prompt = task["question"] + " " + output_format
-            result = await calc_agent.run(task=prompt)
+            # Sometimes MCP tools can time out. In that case, the whole agent will block.
+            # We thus set a timeout of 5 minutes so that the agent will not block indefinitely.
+            result = await asyncio.wait_for(calc_agent.run(task=prompt), timeout=300.0)
             # evaluate
             last_message = cast(str, result.messages[-1].content)  # type: ignore
             answer = re.search(r"###\s*ANSWER:\s*(.+?)(\s*###|$)", last_message)
             if answer:
                 answer = answer.group(1)
             else:
                 answer = last_message
+        except asyncio.TimeoutError as e:
+            print("Timeout occurred. Error:", str(e))
+            answer = "None"
         except Exception as e:
             print("Failure:", str(e))
             answer = "None"
diff --git a/examples/calc_x/train_calc_agent.py b/examples/calc_x/train_calc_agent.py
@@ -30,6 +30,7 @@
 
 import argparse
 import os
+import uuid
 from datetime import datetime
 from typing import Any, Dict, Optional, cast
 
@@ -146,20 +147,25 @@ def train(
     if ci or ci_fast:
         # Config the experiment name and project name so that they are available to CI
         timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
-        EXPERIMENT_NAME = f"calc_x_{timestamp}"
+        random_suffix = uuid.uuid4().hex[:8]
+        EXPERIMENT_NAME = f"calc_x_{timestamp}_{random_suffix}"
 
         PROJECT_NAME = "AgentLightningCI"
 
-        # Simulate writing to $GITHUB_OUTPUT if it’s set
-        github_output = os.getenv("GITHUB_OUTPUT")
-        if github_output:
-            with open(github_output, "a") as f:
-                f.write(f"project_name={PROJECT_NAME}\n")
-                f.write(f"run_name={EXPERIMENT_NAME}\n")
+        # Skip publishing the project/run names below when AGL_CURRENT_ROLE is runner
+        agl_current_role = os.getenv("AGL_CURRENT_ROLE")
 
-        print("Set environment variables:")
-        print(f"PROJECT_NAME={PROJECT_NAME}")
-        print(f"EXPERIMENT_NAME={EXPERIMENT_NAME}")
+        if agl_current_role != "runner":
+            # Simulate writing to $GITHUB_OUTPUT if it’s set
+            github_output = os.getenv("GITHUB_OUTPUT")
+            if github_output:
+                with open(github_output, "a") as f:
+                    f.write(f"project_name={PROJECT_NAME}\n")
+                    f.write(f"run_name={EXPERIMENT_NAME}\n")
+
+            print("Set environment variables:")
+            print(f"PROJECT_NAME={PROJECT_NAME}")
+            print(f"EXPERIMENT_NAME={EXPERIMENT_NAME}")
 
         # Keep it tiny/light without adding new knobs
         config["actor_rollout_ref"]["rollout"]["gpu_memory_utilization"] = 0.8
@@ -210,6 +216,7 @@ def main():
         default="",
         help="Connect to an external store instead of creating a new one in memory",
     )
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
 
     args = parser.parse_args()
 
@@ -224,6 +231,8 @@ def main():
     if args.ci_fast:
         args.ci = True
 
+    agl.setup_logging("DEBUG" if args.debug else "INFO")
+
     train(
         train_file=args.train_file,
         val_file=args.val_file,
diff --git a/scripts/validate_example_wandb.py b/scripts/validate_example_wandb.py
diff --git a/tests/execution/test_client_server.py b/tests/execution/test_client_server.py