
Commit f3cc742 (parent: 2b9aca6)

Revise the code based on the review feedback

3 files changed: 56 insertions(+), 65 deletions(-)

python/sglang/multimodal_gen/benchmarks/bench_serving.py (33 additions, 28 deletions)

```diff
@@ -17,6 +17,7 @@

 import argparse
 import asyncio
+import copy
 import glob
 import json
 import os
```

```diff
@@ -314,8 +315,8 @@ def __getitem__(self, idx: int) -> RequestFuncInput:
             height=self.args.height,
             num_frames=self.args.num_frames,
             fps=self.args.fps,
-            num_inference_steps=getattr(self.args, "num_inference_steps", None),
-            guidance_scale=getattr(self.args, "guidance_scale", None),
+            num_inference_steps=self.args.num_inference_steps,
+            guidance_scale=self.args.guidance_scale,
             image_paths=image_paths,
         )
```
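
Dropping the getattr fallbacks assumes the benchmark's argument parser always defines num_inference_steps and guidance_scale, so a missing option now fails loudly instead of silently passing None. A small illustration with a hypothetical parser (option names assumed to mirror the script's):

```python
import argparse

# Hypothetical parser mirroring the benchmark's options (names assumed).
parser = argparse.ArgumentParser()
parser.add_argument("--num-inference-steps", type=int, default=None)
parser.add_argument("--guidance-scale", type=float, default=None)
args = parser.parse_args([])

# argparse sets the attribute even when the flag is omitted, so direct
# access is safe and still yields the default:
print(args.num_inference_steps)  # None

# getattr(args, "num_inference_step", None) would silently mask this typo;
# direct access raises AttributeError immediately.
```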

```diff
@@ -374,7 +375,7 @@ async def async_request_image_sglang(
     data.add_field("guidance_scale", str(input.guidance_scale))

     # Add profiling and other extra parameters
-    extra_params = input.extra_body.copy()
+    extra_params = copy.deepcopy(input.extra_body)
     if extra_params.pop("profile", None):
         data.add_field("profile", "true")
     for key, value in extra_params.items():
```
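
The switch to copy.deepcopy matters when extra_body holds nested structures: dict.copy() is shallow, so mutating a nested value would still be visible through the caller's request object. A standalone illustration:

```python
import copy

extra_body = {"profile": True, "opts": {"seed": 42}}

shallow = extra_body.copy()
shallow["opts"]["seed"] = 0        # shallow copy shares the nested dict...
print(extra_body["opts"]["seed"])  # 0: the original was mutated

extra_body = {"profile": True, "opts": {"seed": 42}}
deep = copy.deepcopy(extra_body)
deep["opts"]["seed"] = 0           # deep copy owns its nested dict
print(extra_body["opts"]["seed"])  # 42: the original is untouched
```
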
```diff
@@ -766,9 +767,9 @@ async def limited_request_func(req, session, pbar):
             api_url=f"{args.base_url}/start_profile"
         )
         if profile_output.success:
-            print(f"Profiler started: {profile_output.message}")
+            logger.info(f"Profiler started: {profile_output.message}")
         else:
-            print(f"Warning: Failed to start profiler: {profile_output.error}")
+            logger.warning(f"Failed to start profiler: {profile_output.error}")

     # Run benchmark
     pbar = tqdm(total=len(requests_list), disable=args.disable_tqdm)
```
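
The print-to-logger migration routes benchmark output through the script's existing logger, picking up levels, handlers, and timestamps for free, and it drops the hand-written "Warning:" prefix in favor of logger.warning. A minimal stand-in, assuming init_logger wraps the stdlib logging module (the real sglang implementation may differ):

```python
import logging

def init_logger(name: str) -> logging.Logger:
    # Assumed shape of sglang's init_logger: a stdlib logger with a
    # stream handler and a sensible default level.
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("[%(asctime)s] %(levelname)s %(name)s: %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger

logger = init_logger(__name__)
logger.info("Profiler started: ok")                   # replaces print(...)
logger.warning("Failed to start profiler: timeout")   # level carries the "warning"
```
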
```diff
@@ -792,77 +793,81 @@ async def limited_request_func(req, session, pbar):

     # Stop profiler if it was started
     if args.profile:
-        print("Stopping profiler and saving traces...")
+        logger.info("Stopping profiler and saving traces...")
         profile_output = await async_request_profile(
             api_url=f"{args.base_url}/stop_profile"
         )
         if profile_output.success:
-            print(f"Profiler stopped: {profile_output.message}")
+            logger.info(f"Profiler stopped: {profile_output.message}")
         else:
-            print(f"Warning: Failed to stop profiler: {profile_output.error}")
+            logger.warning(f"Failed to stop profiler: {profile_output.error}")

     # Calculate metrics
     metrics = calculate_metrics(outputs, total_duration)

-    print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=60, c="="))
+    logger.info("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=60, c="="))

     # Section 1: Configuration
-    print("{:<40} {:<15}".format("Task:", task_name))
-    print("{:<40} {:<15}".format("Model:", args.model))
-    print("{:<40} {:<15}".format("Dataset:", args.dataset))
+    logger.info("{:<40} {:<15}".format("Task:", task_name))
+    logger.info("{:<40} {:<15}".format("Model:", args.model))
+    logger.info("{:<40} {:<15}".format("Dataset:", args.dataset))

     # Section 2: Execution & Traffic
-    print(f"{'-' * 50}")
-    print("{:<40} {:<15.2f}".format("Benchmark duration (s):", metrics["duration"]))
-    print("{:<40} {:<15}".format("Request rate:", str(args.request_rate)))
-    print(
+    logger.info(f"{'-' * 50}")
+    logger.info(
+        "{:<40} {:<15.2f}".format("Benchmark duration (s):", metrics["duration"])
+    )
+    logger.info("{:<40} {:<15}".format("Request rate:", str(args.request_rate)))
+    logger.info(
         "{:<40} {:<15}".format(
             "Max request concurrency:",
             str(args.max_concurrency) if args.max_concurrency else "not set",
         )
     )
-    print(
+    logger.info(
         "{:<40} {}/{:<15}".format(
             "Successful requests:", metrics["completed_requests"], len(requests_list)
         )
     )

     # Section 3: Performance Metrics
-    print(f"{'-' * 50}")
+    logger.info(f"{'-' * 50}")

-    print(
+    logger.info(
         "{:<40} {:<15.2f}".format(
             "Request throughput (req/s):", metrics["throughput_qps"]
         )
     )
-    print("{:<40} {:<15.4f}".format("Latency Mean (s):", metrics["latency_mean"]))
-    print("{:<40} {:<15.4f}".format("Latency Median (s):", metrics["latency_median"]))
-    print("{:<40} {:<15.4f}".format("Latency P99 (s):", metrics["latency_p99"]))
+    logger.info("{:<40} {:<15.4f}".format("Latency Mean (s):", metrics["latency_mean"]))
+    logger.info(
+        "{:<40} {:<15.4f}".format("Latency Median (s):", metrics["latency_median"])
+    )
+    logger.info("{:<40} {:<15.4f}".format("Latency P99 (s):", metrics["latency_p99"]))

     if metrics["peak_memory_mb_max"] > 0:
-        print(f"{'-' * 50}")
-        print(
+        logger.info(f"{'-' * 50}")
+        logger.info(
             "{:<40} {:<15.2f}".format(
                 "Peak Memory Max (MB):", metrics["peak_memory_mb_max"]
             )
         )
-        print(
+        logger.info(
             "{:<40} {:<15.2f}".format(
                 "Peak Memory Mean (MB):", metrics["peak_memory_mb_mean"]
             )
         )
-        print(
+        logger.info(
             "{:<40} {:<15.2f}".format(
                 "Peak Memory Median (MB):", metrics["peak_memory_mb_median"]
             )
         )

-    print("=" * 60)
+    logger.info("=" * 60)

     if args.output_file:
         with open(args.output_file, "w") as f:
             json.dump(metrics, f, indent=2)
-        print(f"Metrics saved to {args.output_file}")
+        logger.info(f"Metrics saved to {args.output_file}")


 if __name__ == "__main__":
```
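
One detail in the banner line that survives the migration: str.format accepts nested replacement fields inside the format spec, so the fill character and width are themselves parameters. A standalone repro:

```python
# "{s:{c}^{n}}" centers s in a field n columns wide, padded with fill char c.
banner = "{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=60, c="=")
print(banner)  # the title centered in a 60-column rule of '='
```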

python/sglang/multimodal_gen/runtime/entrypoints/http_server.py (13 additions, 27 deletions)

```diff
@@ -13,19 +13,23 @@
 from pydantic import BaseModel

 from sglang.multimodal_gen.configs.sample.sampling_params import SamplingParams
+from sglang.multimodal_gen.runtime.distributed.parallel_state import get_world_rank
 from sglang.multimodal_gen.runtime.entrypoints.openai import image_api, video_api
 from sglang.multimodal_gen.runtime.entrypoints.openai.protocol import (
     VertexGenerateReqInput,
 )
+from sglang.multimodal_gen.runtime.entrypoints.openai.utils import (
+    StartProfileReq,
+    StopProfileReq,
+)
 from sglang.multimodal_gen.runtime.entrypoints.utils import (
     post_process_sample,
     prepare_request,
 )
 from sglang.multimodal_gen.runtime.scheduler_client import async_scheduler_client
 from sglang.multimodal_gen.runtime.server_args import ServerArgs, get_global_server_args
-from sglang.multimodal_gen.runtime.utils.common import get_bool_env_var
 from sglang.multimodal_gen.runtime.utils.logging_utils import init_logger
-from sglang.multimodal_gen.runtime.utils.profiler import SGLDiffusionProfiler
+from sglang.srt.environ import envs

 logger = init_logger(__name__)
```

```diff
@@ -146,30 +150,22 @@ async def start_profile(request: Request, obj: Optional[ProfileReqInput] = None)
     if obj is None:
         obj = ProfileReqInput()

-    output_dir = obj.output_dir or os.getenv("SGLANG_TORCH_PROFILER_DIR", "./logs")
+    output_dir = obj.output_dir or envs.SGLANG_TORCH_PROFILER_DIR.get()

-    # Generate unified profile_id (similar to LLM implementation)
     profile_id = str(int(time_module.time()))

-    # Read env vars for with_stack and record_shapes
-    env_with_stack = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "false")
-    env_record_shapes = get_bool_env_var("SGLANG_PROFILE_RECORD_SHAPES", "false")
-
-    with_stack = obj.with_stack if obj.with_stack is not None else env_with_stack
-    record_shapes = (
-        obj.record_shapes if obj.record_shapes is not None else env_record_shapes
-    )
-
-    # 1. Start profiler in HTTP Server process
-    from sglang.multimodal_gen.runtime.distributed.parallel_state import (
-        get_world_rank,
-    )
+    with_stack = obj.with_stack or envs.SGLANG_PROFILE_WITH_STACK.get()
+    record_shapes = obj.record_shapes or envs.SGLANG_PROFILE_RECORD_SHAPES.get()

     try:
         rank = get_world_rank()
     except Exception:
+        logger.warning("Failed to get world rank, defaulting to 0")
         rank = 0

+    # Lazy import to reduce import time (see issue #10492)
+    from sglang.multimodal_gen.runtime.utils.profiler import SGLDiffusionProfiler
+
     http_profiler = SGLDiffusionProfiler(
         request_id=profile_id,
         rank=rank,
```
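
Two notes on this hunk. First, `obj.with_stack or envs.SGLANG_PROFILE_WITH_STACK.get()` treats an explicit `with_stack=False` in the request the same as leaving it unset; the previous `is not None` checks distinguished the two, so a client can no longer force-disable stack capture while the env var is on. Second, `envs` from `sglang.srt.environ` replaces ad-hoc `os.getenv`/`get_bool_env_var` calls with a typed per-variable accessor. A minimal sketch of such a registry, assuming only its general shape (the variable names match the diff; the implementation is illustrative, not sglang's actual one):

```python
import os

class EnvStr:
    """String env var with a default; .get() reads the live environment."""

    def __init__(self, name: str, default: str):
        self.name, self.default = name, default

    def get(self) -> str:
        return os.getenv(self.name, self.default)

class EnvBool:
    """Boolean env var; '1'/'true'/'yes' (any case) parse as True."""

    def __init__(self, name: str, default: bool):
        self.name, self.default = name, default

    def get(self) -> bool:
        raw = os.getenv(self.name)
        if raw is None:
            return self.default
        return raw.strip().lower() in ("1", "true", "yes")

class _Envs:
    SGLANG_TORCH_PROFILER_DIR = EnvStr("SGLANG_TORCH_PROFILER_DIR", "./logs")
    SGLANG_PROFILE_WITH_STACK = EnvBool("SGLANG_PROFILE_WITH_STACK", False)
    SGLANG_PROFILE_RECORD_SHAPES = EnvBool("SGLANG_PROFILE_RECORD_SHAPES", False)

envs = _Envs()
print(envs.SGLANG_TORCH_PROFILER_DIR.get())  # "./logs" unless overridden
print(envs.SGLANG_PROFILE_WITH_STACK.get())  # False unless the env var is set
```
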
```diff
@@ -185,11 +181,6 @@ async def start_profile(request: Request, obj: Optional[ProfileReqInput] = None)
     _global_profiler_state["profiler"] = http_profiler
     _global_profiler_state["profile_id"] = profile_id

-    # 2. Start profiler in GPU Worker process via ZMQ
-    from sglang.multimodal_gen.runtime.entrypoints.openai.utils import (
-        StartProfileReq,
-    )
-
     start_req = StartProfileReq(
         output_dir=output_dir,
         profile_id=profile_id,
```

```diff
@@ -241,11 +232,6 @@ async def stop_profile():
     if profiler is not None:
         profiler.stop(export_trace=True, dump_rank=None)  # Save for all ranks

-    # 2. Stop profiler in GPU Worker process via ZMQ
-    from sglang.multimodal_gen.runtime.entrypoints.openai.utils import (
-        StopProfileReq,
-    )
-
     stop_req = StopProfileReq(export_trace=True)
     try:
         response = await async_scheduler_client.forward(stop_req)
```
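
Note the two complementary moves in this file: the lightweight StartProfileReq/StopProfileReq imports are hoisted to module scope, while the heavy SGLDiffusionProfiler import becomes local to start_profile (per the "Lazy import to reduce import time" comment). A self-contained sketch of the lazy-import pattern, using a stdlib module as a stand-in for the heavy dependency:

```python
import sys
import time

def handler() -> float:
    # Lazy import: the module is loaded on the first call only; afterwards
    # Python serves it from the sys.modules cache at near-zero cost.
    t0 = time.perf_counter()
    import xml.dom.minidom  # noqa: F401  # stand-in for a heavy dependency
    return time.perf_counter() - t0

first = handler()   # pays the one-time import cost
second = handler()  # cache hit
print(f"first={first:.6f}s, second={second:.6f}s")
print("xml.dom.minidom" in sys.modules)  # True once handler() has run
```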

python/sglang/multimodal_gen/runtime/utils/profiler.py (10 additions, 10 deletions)

```diff
@@ -1,10 +1,12 @@
 import gzip
 import os
+from pathlib import Path

 import torch

+from sglang.multimodal_gen.runtime.platforms import current_platform
 from sglang.multimodal_gen.runtime.utils.logging_utils import CYAN, RESET, init_logger
-from sglang.srt.utils import get_bool_env_var
+from sglang.srt.environ import envs

 logger = init_logger(__name__)
```

```diff
@@ -38,12 +40,10 @@ def __init__(
         self.full_profile = full_profile
         self.is_host = is_host

-        # Use environment variables with fallback to parameters
-        self.log_dir = log_dir or os.getenv("SGLANG_TORCH_PROFILER_DIR", "./logs")
+        self.log_dir = log_dir or envs.SGLANG_TORCH_PROFILER_DIR.get()

-        # Read from environment variables, allow parameter override
-        env_with_stack = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "false")
-        env_record_shapes = get_bool_env_var("SGLANG_PROFILE_RECORD_SHAPES", "false")
+        env_with_stack = envs.SGLANG_PROFILE_WITH_STACK.get()
+        env_record_shapes = envs.SGLANG_PROFILE_RECORD_SHAPES.get()

         self.with_stack = with_stack if with_stack is not None else env_with_stack
         self.record_shapes = (
```

```diff
@@ -107,7 +107,7 @@ def _resolve_activities(

     def _default() -> list[torch.profiler.ProfilerActivity]:
         ret = [torch.profiler.ProfilerActivity.CPU]
-        if torch.cuda.is_available():
+        if current_platform.is_cuda_alike():
             ret.append(torch.profiler.ProfilerActivity.CUDA)
         return ret
```

```diff
@@ -123,7 +123,7 @@ def _default() -> list[torch.profiler.ProfilerActivity]:
         if s == "cpu":
             use_cpu = True
         elif s in ("gpu", "cuda"):
-            if torch.cuda.is_available():
+            if current_platform.is_cuda_alike():
                 use_cuda = True
             else:
                 logger.warning(
```

```diff
@@ -169,7 +169,7 @@ def stop(self, export_trace: bool = True, dump_rank: int | None = None):
             return
         self.has_stopped = True
         logger.info("Stopping Profiler...")
-        if torch.cuda.is_available():
+        if current_platform.is_cuda_alike():
             torch.cuda.synchronize()
         self.profiler.stop()
```
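
Replacing torch.cuda.is_available() with current_platform.is_cuda_alike() funnels device detection through a single platform object; "cuda-alike" conventionally covers both NVIDIA CUDA and AMD ROCm builds. A minimal sketch of the pattern, illustrative rather than the actual sglang.multimodal_gen.runtime.platforms implementation:

```python
import torch

class CudaPlatform:
    def is_cuda_alike(self) -> bool:
        # PyTorch exposes ROCm through the torch.cuda namespace as well,
        # so this single check covers both NVIDIA and AMD builds.
        return True

class CpuPlatform:
    def is_cuda_alike(self) -> bool:
        return False

# Resolved once at import time; every call site stays device-agnostic.
current_platform = CudaPlatform() if torch.cuda.is_available() else CpuPlatform()

if current_platform.is_cuda_alike():
    torch.cuda.synchronize()
```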

```diff
@@ -194,7 +194,7 @@ def _export_trace(self):
         else:
             filename = f"{self.request_id}-rank-{self.rank}.trace.json.gz"

-        trace_path = os.path.abspath(os.path.join(self.log_dir, filename))
+        trace_path = str(Path(self.log_dir, filename).resolve())
         self.profiler.export_chrome_trace(trace_path)

         if self._check_trace_integrity(trace_path):
```
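
Path(self.log_dir, filename).resolve() is the pathlib spelling of os.path.abspath(os.path.join(...)), with one nuance: resolve() also follows symlinks, while abspath() merely normalizes. A quick equivalence check:

```python
import os
from pathlib import Path

log_dir, filename = "./logs", "req-1-rank-0.trace.json.gz"

old_path = os.path.abspath(os.path.join(log_dir, filename))
new_path = str(Path(log_dir, filename).resolve())

# Identical whenever no symlink sits on the path.
print(old_path == new_path)
```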
