
Commit 0a5e856

[perf] feat: support profiler in model engine and sft trainer (#4749)
### What does this PR do?

- As title

### Checklist Before Starting

- [ ] Search for similar PRs. Paste at least one query link here: ...
- [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`
  - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### API and Usage Example

> Demonstrate how the API changes if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)
1 parent a090cd8 commit 0a5e856

File tree: 11 files changed (+178 -29 lines)
Lines changed: 77 additions & 0 deletions (new file)

```yaml
# Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
_target_: verl.utils.profiler.ProfilerConfig

# Profiler tool; defaults to the same value as profiler.tool in the global config.
# choices: nsys, npu, torch
tool: torch

# Whether to enable profiling on the Actor.
enable: False

# Whether to profile all ranks.
all_ranks: False

# The ranks that will be profiled. [] or [0,1,...]
ranks: []

# Path where profiling results are saved.
save_path: "outputs/profile"

tool_config:
  npu:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.config.NPUToolConfig

    # Contents to profile; can be empty.
    # options: npu, cpu, memory, shapes, module, stack
    contents: []

    # Collection level; optional values: level_none, level0, level1, level2.
    level: "level0"

    # Whether to automatically parse the data.
    analysis: True

    # True: each task has its own database; False: all tasks in one training step share one database.
    discrete: False

    name: npu

  nsys:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.config.NsightToolConfig

    # True: each task has its own database; False: all tasks in one training step share one database.
    discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}

    name: nsight

  torch:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.config.TorchProfilerToolConfig

    # Mini-batch at which profiling starts during training.
    # NOTICE: different from the global-steps config, which refers to iterations;
    # this field only refers to mini-batches.
    step_start: 0

    # Mini-batch at which profiling stops during training.
    step_end: null

    # Manual save.
    manual_save: True

    name: torch

  torch_memory:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.config.TorchMemoryToolConfig

    # Maximum number of memory allocation entries to track.
    trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}

    # Stack trace depth for memory allocations.
    stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}

    name: torch_memory
```
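
The `_target_` entries above are what `verl.utils.omega_conf_to_dataclass` uses to materialize these sections into dataclasses. A minimal sketch of that round trip, assuming only the top-level keys shown above (the interpolated `tool_config` entries are omitted, and the import path follows the comment in the YAML):

```python
# Hedged sketch: build a ProfilerConfig from an OmegaConf node carrying _target_.
# Only keys visible in the YAML above are used; everything else is left at its default.
from omegaconf import OmegaConf

from verl.utils import omega_conf_to_dataclass  # path as referenced in the YAML comment

profiler_node = OmegaConf.create(
    {
        "_target_": "verl.utils.profiler.ProfilerConfig",
        "tool": "torch",
        "enable": False,
        "all_ranks": False,
        "ranks": [],
        "save_path": "outputs/profile",
    }
)
profiler_config = omega_conf_to_dataclass(profiler_node)
print(type(profiler_config).__name__, profiler_config.tool, profiler_config.ranks)
```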

verl/trainer/config/sft_trainer_engine.yaml

Lines changed: 3 additions & 0 deletions
```diff
@@ -10,6 +10,7 @@ defaults:
   - model@model: hf_model
   - engine@engine: fsdp
   - optim@optim: fsdp
+  - profiler@profiler: profiler
   - _self_

 data:
@@ -78,3 +79,5 @@ trainer:

   nnodes: 1
   n_gpus_per_node: 1
+
+  profile_interval: [-1, -1]
```

verl/trainer/sft_trainer.py

Lines changed: 20 additions & 0 deletions
```diff
@@ -96,6 +96,19 @@ def _build_config(self):
         self.engine_config = omega_conf_to_dataclass(self.config.engine)
         self.optimizer_config = omega_conf_to_dataclass(self.config.optim)
         self.checkpoint_config = omega_conf_to_dataclass(self.config.checkpoint)
+        self.profiler_config = omega_conf_to_dataclass(self.config.profiler)
+
+        # check profile interval
+        self.profiler_interval = self.config.trainer.profile_interval
+        self._validate_profiler_interval()
+
+    def _validate_profiler_interval(self):
+        assert len(self.profiler_interval) == 2
+        self.start_profile_step = self.profiler_interval[0]
+        self.end_profile_step = self.profiler_interval[1]
+        assert self.end_profile_step >= self.start_profile_step
+        if self.start_profile_step < 0:
+            assert self.end_profile_step < 0

     def _build_engine(self):
         from verl.workers.engine_workers import TrainingWorkerConfig
@@ -109,6 +122,7 @@ def _build_engine(self):
             engine_config=self.engine_config,
             optimizer_config=self.optimizer_config,
             checkpoint_config=self.checkpoint_config,
+            profiler_config=self.profiler_config,
         )

         self.training_client = TrainingWorker(config=config)
@@ -303,9 +317,15 @@ def fit(self):

             tu.assign_non_tensor(data, update_lr_scheduler=True, global_token_num=batch_seqlens)

+            # start profile in SPMD mode
+            if global_step == self.start_profile_step:
+                self.training_client.start_profile()
             # train for one batch
             output = self.training_client.train_batch(data=data)

+            if global_step == self.end_profile_step:
+                self.training_client.stop_profile()
+
             if self.engine.is_mp_src_rank_with_outputs():
                 metrics = tu.get(output, "metrics")

```
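
Read together with the `trainer.profile_interval: [-1, -1]` default added above, `_validate_profiler_interval` and the `fit()` hooks treat the interval as a `[start, end]` pair of global training steps: `start_profile()` fires when `global_step == start`, `stop_profile()` fires when `global_step == end`, and an all-negative pair keeps profiling disabled. A self-contained restatement of that rule (the helper name here is illustrative, not part of the commit):

```python
# Illustrative re-statement of the interval rule enforced by
# _validate_profiler_interval() in the diff above; parse_profile_interval
# is a hypothetical helper, not part of the commit.
def parse_profile_interval(profile_interval: list[int]) -> tuple[int, int]:
    assert len(profile_interval) == 2
    start, end = profile_interval
    assert end >= start
    if start < 0:
        # A negative start means profiling is disabled; end must be negative too.
        assert end < 0
    return start, end


print(parse_profile_interval([-1, -1]))  # (-1, -1): profiling disabled
print(parse_profile_interval([2, 5]))    # start profiling at step 2, stop at step 5
```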

verl/trainer/sft_trainer_ray.py

Lines changed: 20 additions & 0 deletions
```diff
@@ -90,6 +90,19 @@ def _build_config(self):
         self.engine_config = omega_conf_to_dataclass(self.config.engine)
         self.optimizer_config = omega_conf_to_dataclass(self.config.optim)
         self.checkpoint_config = omega_conf_to_dataclass(self.config.checkpoint)
+        self.profiler_config = omega_conf_to_dataclass(self.config.profiler)
+
+        # check profile interval
+        self.profiler_interval = self.config.trainer.profile_interval
+        self._validate_profiler_interval()
+
+    def _validate_profiler_interval(self):
+        assert len(self.profiler_interval) == 2
+        self.start_profile_step = self.profiler_interval[0]
+        self.end_profile_step = self.profiler_interval[1]
+        assert self.end_profile_step >= self.start_profile_step
+        if self.start_profile_step < 0:
+            assert self.end_profile_step < 0

     def _build_engine(self):
         from verl.workers.engine_workers import TrainingWorkerConfig
@@ -103,6 +116,7 @@ def _build_engine(self):
             engine_config=self.engine_config,
             optimizer_config=self.optimizer_config,
             checkpoint_config=self.checkpoint_config,
+            profiler_config=self.profiler_config,
         )

         # create resource pool and worker group
@@ -279,10 +293,16 @@ def fit(self):

             tu.assign_non_tensor(data, update_lr_scheduler=True, global_token_num=batch_seqlens)

+            # start profile in SPMD mode
+            if global_step == self.start_profile_step:
+                self.training_client.start_profile()
             # train for one batch
             output = self.training_client.train_batch(data)
             output = output.get()

+            if global_step == self.end_profile_step:
+                self.training_client.stop_profile()
+
             metrics = tu.get(output, "metrics")

             # TODO: we can actually accumulate metrics for N steps and perform aggregate metrics
```

verl/utils/profiler/__init__.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -15,7 +15,7 @@
 from ..device import is_npu_available
 from ..import_utils import is_nvtx_available
 from .performance import GPUMemoryLogger, log_gpu_memory_usage, simple_timer
-from .profile import DistProfiler, DistProfilerExtension, ProfilerConfig
+from .profile import DistProfiler, DistProfilerExtension, Profiler, ProfilerConfig

 # Select marker implementations by availability, but keep DistProfiler as our dispatcher
 if is_nvtx_available():
@@ -34,6 +34,7 @@
     "mark_annotate",
     "DistProfiler",
     "DistProfilerExtension",
+    "Profiler",
     "ProfilerConfig",
     "simple_timer",
     "marked_timer",
```

verl/utils/profiler/config.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -27,6 +27,7 @@ class NsightToolConfig(BaseConfig):

     "True for each task has its own database, False for all tasks in one training step share one database."
     discrete: bool = False
+    name: str = "nsight"

     def __post_init__(self) -> None:
         pass
@@ -43,6 +44,8 @@ class TorchProfilerToolConfig(BaseConfig):

     step_start: int = -1
     step_end: int = -1
+    manual_save: bool = True
+    name: str = "torch"

     def __post_init__(self) -> None:
         """config validation logics go here"""
@@ -61,6 +64,7 @@ class TorchMemoryToolConfig(BaseConfig):

     trace_alloc_max_entries: int = 100_000
     stack_depth: int = 32
+    name: str = "torch_memory"

     def __post_init__(self) -> None:
         """config validation logics go here"""
@@ -87,6 +91,8 @@ class NPUToolConfig(NsightToolConfig):
     # Whether to automatically parse the data.
     analysis: bool = False

+    name: str = "npu"
+
     def __post_init__(self) -> None:
         """config validation logics go here"""
         assert isinstance(self.contents, list), f"Profiler contents must be of type list, got {type(self.contents)}"
```
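
A quick sketch of the new fields in isolation, assuming only the dataclass fields visible in this diff and defaults for everything else:

```python
# Hedged sketch: every tool config now carries a stable `name`, and the torch
# tool additionally gains `manual_save`. Constructor arguments mirror the fields
# shown above; other fields and __post_init__ details are not assumed here.
from verl.utils.profiler.config import NsightToolConfig, TorchProfilerToolConfig

torch_tool = TorchProfilerToolConfig(step_start=0, step_end=2, manual_save=True)
assert torch_tool.name == "torch" and torch_tool.manual_save is True
assert NsightToolConfig().name == "nsight"
```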

verl/utils/profiler/profile.py

Lines changed: 31 additions & 13 deletions
```diff
@@ -40,10 +40,15 @@ class Profiler:
         config: Configuration object containing profiling parameters
     """

-    def __init__(self, config: ProfilerConfig, tool_config: Optional[TorchProfilerToolConfig] = None):
+    def __init__(
+        self, config: ProfilerConfig, tool_config: Optional[TorchProfilerToolConfig] = None, save_file_prefix=None
+    ):
         # note: if use_profile is not set, it will be set as None, so that all functions will be skipped
         if not config:
             config = ProfilerConfig(ranks=[], enable=False)
+
+        self.save_file_prefix = save_file_prefix
+
         if not tool_config:
             assert not config.enable, "tool_config must be provided when profiler is enabled"
             self.prof = None
@@ -56,7 +61,8 @@ def __init__(self, config: ProfilerConfig, tool_config: Optional[TorchProfilerTo
             self.rank = torch.distributed.get_rank()
             # we need to validate the config before using the profiler
             self._validate()
-            if self.rank in self.config.profile_ranks:
+
+            if self.rank in self.config.ranks or self.config.all_ranks:
                 print(f"[Profiler] Profiler init for rank {self.rank}")

                 self.prof = torch.profiler.profile(
@@ -74,11 +80,24 @@ def __init__(self, config: ProfilerConfig, tool_config: Optional[TorchProfilerTo
                     with_stack=True,
                 )

+    def _trace_handler(self, prof):
+        if not os.path.exists(self.config.save_path):
+            os.makedirs(self.config.save_path)
+
+        save_file_name = f"prof_rank-{self.rank}.json.gz"
+        if self.save_file_prefix is not None:
+            save_file_name = self.save_file_prefix + "_" + save_file_name
+        save_path = os.path.join(self.config.save_path, save_file_name)
+        print(f"[Profiler] Saving trace to {save_path}")
+        prof.export_chrome_trace(save_path)
+        self.enable = False
+        self.saved = True
+
     def _validate(self):
         if self.enable:
-            if self.config.profile_ranks is None:
+            if self.config.ranks is None:
                 print("[WARNING] Profile ranks is not set, default to rank 0")
-                self.config.profile_ranks = [0]
+                self.config.ranks = [0]
             assert self.tool_config.step_start >= 0, "[ERROR] Profile step start must be greater than 0"
             assert self.tool_config.step_end >= 0, "[ERROR] Profile step end must be greater than 0"
             assert self.tool_config.step_start < self.tool_config.step_end, (
@@ -99,18 +118,14 @@ def step(self):

     def stop(self):
         if self.check():
+            self.step()
             print(f"[Profiler] stopped for rank {self.rank}")
             self.prof.stop()
+            self.save()

     def save(self):
-        if self.prof is not None and not self.saved:
-            if not os.path.exists(self.config.save_path):
-                os.makedirs(self.config.save_path)
-            save_file_name = f"/prof_start_{self.config.step_start}_end_{self.config.step_end}_rank_{self.rank}.json"
-            print(f"[Profiler] Saving trace to {self.config.save_path + save_file_name}")
-            self.prof.export_chrome_trace(self.config.save_path + save_file_name)
-            self.enable = False
-            self.saved = True
+        if self.prof is not None and not self.saved and self.tool_config.manual_save:
+            self._trace_handler(prof=self.prof)

     def stop_and_save(self):
         if self.check():
@@ -188,7 +203,10 @@ def __init__(
     ):
         # Default config
         if not config:
-            config = ProfilerConfig(ranks=[], enable=False)
+            config = ProfilerConfig(ranks=[], enable=False, tool_config=None)
+
+        if tool_config is None:
+            tool_config = config.tool_config

         self._impl = None
         self._tool = getattr(config, "tool", None)
```
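
Putting the `Profiler` changes together: rank selection now honors `ranks`/`all_ranks`, traces are written by `_trace_handler` (optionally prefixed via `save_file_prefix`), and `stop()` now steps and saves on its own. A hedged usage sketch, assuming a `start()` method alongside the `step()`/`stop()` shown in the diff and an initialized `torch.distributed` process group:

```python
# Hedged sketch of driving the torch-backed Profiler directly. Field names follow
# ProfilerConfig / TorchProfilerToolConfig as shown in this commit; start() is
# assumed to exist next to step()/stop(), and the single-process "gloo" group is
# only there so torch.distributed.get_rank() works inside __init__.
import torch.distributed as dist

from verl.utils.profiler import Profiler, ProfilerConfig
from verl.utils.profiler.config import TorchProfilerToolConfig

if not dist.is_initialized():
    dist.init_process_group(backend="gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1)

config = ProfilerConfig(enable=True, ranks=[0], save_path="outputs/profile")
tool_config = TorchProfilerToolConfig(step_start=0, step_end=2, manual_save=True)

profiler = Profiler(config, tool_config=tool_config, save_file_prefix="sft")
profiler.start()
for _ in range(3):
    # one training mini-batch would run here
    profiler.step()
profiler.stop()  # per the diff: stop() also calls step() and then save()
```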

verl/utils/seqlen_balancing.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -68,6 +68,7 @@ def karmarkar_karp(seqlen_list: list[int], k_partitions: int, equal_size: bool)
     Note:
         When equal_size=True, len(seqlen_list) must be divisible by k_partitions.
     """
+
     # see: https://en.wikipedia.org/wiki/Largest_differencing_method
     class Set:
         def __init__(self) -> None:
```

verl/utils/torch_functional.py

Lines changed: 3 additions & 9 deletions
```diff
@@ -200,9 +200,7 @@ def logprobs_from_logits_v2(logits: torch.FloatTensor, labels: torch.Tensor) ->
     return logprobs_labels


-def clip_by_value(
-    x: torch.Tensor, tensor_min: torch.Tensor, tensor_max: torch.Tensor
-) -> torch.Tensor:
+def clip_by_value(x: torch.Tensor, tensor_min: torch.Tensor, tensor_max: torch.Tensor) -> torch.Tensor:
     """Clip tensor values to a range defined by tensor bounds.

     Extension of torch.clamp that supports tensor-valued min/max bounds
@@ -265,9 +263,7 @@ def entropy_from_logits_with_chunking(logits: torch.Tensor, chunk_size: int = 20
     return entropy


-def masked_sum(
-    values: torch.Tensor, mask: torch.Tensor, axis: int | tuple[int, ...] | None = None
-) -> torch.Tensor:
+def masked_sum(values: torch.Tensor, mask: torch.Tensor, axis: int | tuple[int, ...] | None = None) -> torch.Tensor:
     """Compute sum of tensor values where mask is True.

     NaN values outside the mask are replaced with zeros to prevent
@@ -389,9 +385,7 @@ def compute_grad_norm(model: nn.Module) -> float:
     return total_grad_square


-def broadcast_dict_tensor(
-    tensors: dict[str, torch.Tensor] | TensorDict, src: int, group
-) -> None:
+def broadcast_dict_tensor(tensors: dict[str, torch.Tensor] | TensorDict, src: int, group) -> None:
     """Broadcast all tensors in a dictionary from source rank to all ranks.

     Iterates over all tensors in the dictionary and broadcasts each one
```

verl/workers/config/engine.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -19,6 +19,7 @@
 from verl.base_config import BaseConfig
 from verl.trainer.config import CheckpointConfig

+from ...utils.profiler import ProfilerConfig
 from .model import HFModelConfig
 from .optimizer import OptimizerConfig

@@ -273,3 +274,4 @@ class TrainingWorkerConfig(BaseConfig):
     engine_config: EngineConfig = None
     optimizer_config: OptimizerConfig = None
     checkpoint_config: CheckpointConfig = None
+    profiler_config: ProfilerConfig = None
```
