Support the discrete profiler in the agent loop

mengchengTang · mengchengTang · commit b76689b757ed · 2025-12-04T17:02:52.000+08:00
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
@@ -819,6 +819,7 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
 
         # Fix for Issue #4147: Always call wake_up() to ensure weight sync
         # The wake_up()/sleep() methods internally check free_cache_engine
+        self._start_profile(role="rollout_generate")
         self.wake_up()
         if self.reward_model_manager:
             self.reward_model_manager.wake_up()
@@ -841,6 +842,7 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
         timing = self._performance_metrics(metrics, output)
 
         output.meta_info = {"timing": timing, **outputs[0].meta_info}
+        self._stop_profile()
         return output
 
     def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]:
@@ -877,6 +879,20 @@ def clear_kv_cache(self):
         """Clear all rollout kv cache, but don`t sleep."""
         self._run_all([replica.clear_kv_cache() for replica in self.rollout_replicas])
 
+    def _start_profile(self, **kwargs):
+        """Ask every rollout replica to begin a profiling segment.
+
+        Args:
+            **kwargs: Keyword arguments forwarded unchanged to each replica's ``start_profile``.
+        """
+        start_calls = [replica.start_profile(**kwargs) for replica in self.rollout_replicas]
+        self._run_all(start_calls)
+
+    def _stop_profile(self):
+        """Ask every rollout replica to end its profiling segment."""
+        stop_calls = [replica.stop_profile() for replica in self.rollout_replicas]
+        self._run_all(stop_calls)
+
     def _run_all(self, tasks: list[asyncio.Task]):
         async def run_all():
             await asyncio.gather(*tasks)
diff --git a/verl/utils/profiler/mstx_profile.py b/verl/utils/profiler/mstx_profile.py
@@ -214,6 +214,60 @@ def stop(self):
                 self.profile_npu.stop()
                 NPUProfiler._define_count -= 1
 
+    def capture_start(self, **kwargs):
+        """Start an on-demand profiling segment.
+
+        Does nothing unless both ``self.enable`` and ``self.this_step`` are truthy.
+        A marker range is always opened; in discrete mode a profiler obtained from
+        ``get_npu_profiler`` is started first.
+
+        Args:
+            **kwargs: ``message`` and ``role`` are read. ``message`` (falling back
+                to ``role``) names the marker range; ``role`` is also passed to
+                ``get_npu_profiler`` in discrete mode.
+        """
+        if not (self.enable and self.this_step):
+            return
+
+        # Close a segment left open by an unmatched capture_start; overwriting its
+        # handles would leave the old range and profiler running with no way to stop them.
+        self.capture_stop()
+
+        message = kwargs.get("message")
+        role = kwargs.get("role")
+        profile_name = message or role
+
+        if self.discrete:
+            self.capture_profiler_npu = get_npu_profiler(
+                contents=self.profile_contents,
+                profile_level=self.profile_level,
+                profile_save_path=self.profile_save_path,
+                analysis=self.analysis,
+                role=role,
+            )
+            self.capture_profiler_npu.start()
+        self._capture_range_id = mark_start_range(message=profile_name)
+
+    def capture_stop(self):
+        """Stop the on-demand profiling segment.
+
+        Cleanup is keyed on what ``capture_start`` actually opened rather than on the
+        current ``enable``/``this_step`` flags, so a segment is still closed if those
+        flags changed in between. A call with nothing open is a no-op.
+        """
+        profiler = getattr(self, "capture_profiler_npu", None)
+        try:
+            if hasattr(self, "_capture_range_id"):
+                range_id = self._capture_range_id
+                del self._capture_range_id
+                mark_end_range(range_id)
+        finally:
+            # Runs even if closing the range raised, so the profiler is never left running.
+            if profiler is not None:
+                del self.capture_profiler_npu
+                profiler.step()
+                profiler.stop()
+
     def annotate(self, message: Optional[str] = None, role: Optional[str] = None, **kwargs_outer) -> Callable:
         """Decorate a Worker member function to profile the current rank in the current training step.
 
diff --git a/verl/utils/profiler/profile.py b/verl/utils/profiler/profile.py
@@ -227,6 +227,37 @@ def start(self, **kwargs):
     def stop(self):
         return getattr(self._impl, "stop", lambda: None)()
 
+    def capture_start(self, **kwargs):
+        """Forward to ``self._impl.capture_start`` when that attribute exists.
+
+        Args:
+            **kwargs: Passed through unchanged to the delegated method.
+
+        Returns:
+            Whatever the delegated method returns, or ``None`` when ``self._impl``
+            has no ``capture_start`` attribute.
+        """
+        impl = self._impl
+        try:
+            backend_start = impl.capture_start
+        except AttributeError:
+            return None
+        return backend_start(**kwargs)
+
+    def capture_stop(self):
+        """Forward to ``self._impl.capture_stop`` when that attribute exists.
+
+        Returns:
+            Whatever the delegated method returns, or ``None`` when ``self._impl``
+            has no ``capture_stop`` attribute.
+        """
+        impl = self._impl
+        try:
+            backend_stop = impl.capture_stop
+        except AttributeError:
+            return None
+        return backend_stop()
+
     @classmethod
     def annotate(
         cls,
diff --git a/verl/workers/fsdp_workers.py b/verl/workers/fsdp_workers.py
@@ -1939,6 +1939,31 @@ async def sleep(self):
         await self.trainer_mode()
         return True
 
+    @register(dispatch_mode=Dispatch.DIRECT_ROLLOUT_METHOD)
+    async def start_capture_profile(self, **kwargs):
+        """Open an on-demand profiling segment through ``self.profiler``.
+
+        Args:
+            **kwargs: Forwarded unchanged to ``self.profiler.capture_start``.
+
+        Returns:
+            Always ``True``.
+        """
+        profiler = self.profiler
+        profiler.capture_start(**kwargs)
+        return True
+
+    @register(dispatch_mode=Dispatch.DIRECT_ROLLOUT_METHOD)
+    async def stop_capture_profile(self):
+        """Close the on-demand profiling segment through ``self.profiler``.
+
+        Returns:
+            Always ``True``.
+        """
+        profiler = self.profiler
+        profiler.capture_stop()
+        return True
+
     # ============================ vLLM related ============================
 
     @register(dispatch_mode=Dispatch.DIRECT_ROLLOUT_METHOD)
diff --git a/verl/workers/rollout/replica.py b/verl/workers/rollout/replica.py
@@ -216,6 +216,20 @@ async def clear_kv_cache(self):
         """reset kv cache in each rollout server."""
         await asyncio.gather(*[server.clear_kv_cache.remote() for server in self.servers])
 
+    async def start_profile(self, **kwargs):
+        """Start a profiling segment on every worker in ``self.workers``.
+
+        Args:
+            **kwargs: Forwarded unchanged to each worker's ``start_capture_profile``.
+        """
+        pending = [worker.start_capture_profile.remote(**kwargs) for worker in self.workers]
+        await asyncio.gather(*pending)
+
+    async def stop_profile(self):
+        """Stop the profiling segment on every worker in ``self.workers``."""
+        pending = [worker.stop_capture_profile.remote() for worker in self.workers]
+        await asyncio.gather(*pending)
+
 
 class RolloutReplicaRegistry:
     """Factory for managing rollout replica implementations."""