Skip to content

Commit 0f3d3e3

Browse files
yewentao256 authored and mtparet committed
[Log] Reduce duplicate log (vllm-project#37313)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
1 parent 899df73 commit 0f3d3e3

8 files changed

Lines changed: 20 additions & 10 deletions

File tree

vllm/compilation/backends.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,13 +371,15 @@ def autograd_cache_key(*args, **kwargs):
371371
logger.info_once(
372372
"Cache the graph of compile range %s for later use",
373373
str(compile_range),
374+
scope="local",
374375
)
375-
logger.debug(
376+
logger.debug_once(
376377
"Store the %s-th graph for compile range%s from %s via handle %s",
377378
graph_index,
378379
str(compile_range),
379380
self.compiler.name,
380381
handle,
382+
scope="local",
381383
)
382384

383385
# after compiling the last graph, record the end time

vllm/config/scheduler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,9 +228,10 @@ def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
228228
self.encoder_cache_size = self.max_num_batched_tokens
229229

230230
if self.enable_chunked_prefill:
231-
logger.info(
231+
logger.info_once(
232232
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
233233
self.max_num_batched_tokens,
234+
scope="local",
234235
)
235236

236237
if self.max_num_partial_prefills > 1:

vllm/model_executor/layers/attention/mm_encoder_attention.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,9 @@ def __init__(
227227
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
228228
_get_flashinfer_workspace_buffer()
229229

230-
logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
230+
logger.info_once(
231+
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
232+
)
231233

232234
@classmethod
233235
def enabled(cls) -> bool:

vllm/model_executor/models/qwen3_next.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,14 +192,15 @@ def __init__(self) -> None:
192192
use_flashinfer = supports_flashinfer
193193

194194
if use_flashinfer:
195-
logger.info_once("Using FlashInfer GDN prefill kernel")
195+
logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
196196
logger.info_once(
197197
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
198198
"take a while to compile. Set `--gdn-prefill-backend triton` to "
199-
"avoid JIT compile time."
199+
"avoid JIT compile time.",
200+
scope="local",
200201
)
201202
else:
202-
logger.info_once("Using Triton/FLA GDN prefill kernel")
203+
logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")
203204

204205
self._forward_method = (
205206
self.forward_cuda if use_flashinfer else self.forward_native

vllm/platforms/cuda.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,8 @@ def get_vit_attn_backend(
387387
)
388388
if is_backend_supported:
389389
logger.info_once(
390-
f"Using backend {vit_attn_backend} for vit attention"
390+
f"Using backend {vit_attn_backend} for vit attention",
391+
scope="local",
391392
)
392393
return vit_attn_backend
393394
except ImportError:

vllm/v1/executor/multiproc_executor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -998,12 +998,13 @@ def set_multiprocessing_worker_envs():
998998
"OMP_NUM_THREADS" not in os.environ
999999
and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
10001000
):
1001-
logger.warning(
1001+
logger.warning_once(
10021002
"Reducing Torch parallelism from %d threads to %d to avoid "
10031003
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
10041004
"external environment to tune this value as needed.",
10051005
current_parallelism,
10061006
default_omp_num_threads,
1007+
scope="local",
10071008
)
10081009
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
10091010
torch.set_num_threads(default_omp_num_threads)

vllm/v1/worker/dp_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
2828
# this optimization if we run into this case.
2929
if parallel_config.disable_nccl_for_dp_synchronization:
3030
logger.info_once(
31-
"Using CPU all reduce to synchronize DP padding between ranks."
31+
"Using CPU all reduce to synchronize DP padding between ranks.",
32+
scope="local",
3233
)
3334
device = "cpu"
3435
group = get_dp_group().cpu_group

vllm/v1/worker/gpu_model_runner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5510,13 +5510,14 @@ def profile_run(self) -> None:
55105510
dummy_modality
55115511
]
55125512

5513-
logger.info(
5513+
logger.info_once(
55145514
"Encoder cache will be initialized with a "
55155515
"budget of %s tokens, and profiled with "
55165516
"%s %s items of the maximum feature size.",
55175517
encoder_budget,
55185518
max_mm_items_per_batch,
55195519
dummy_modality,
5520+
scope="local",
55205521
)
55215522

55225523
# Create dummy batch of multimodal inputs.

0 commit comments

Comments (0)