Skip to content

Commit 0f3d3e3

Browse files
yewentao256 authored and mtparet committed
[Log] Reduce duplicate log (vllm-project#37313)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
1 parent 899df73 commit 0f3d3e3

8 files changed

Lines changed: 20 additions & 10 deletions

File tree

vllm/compilation/backends.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,13 +371,15 @@ def autograd_cache_key(*args, **kwargs):
371371
logger.info_once(
372372
"Cache the graph of compile range %s for later use",
373373
str(compile_range),
374+
scope="local",
374375
)
375-
logger.debug(
376+
logger.debug_once(
376377
"Store the %s-th graph for compile range%s from %s via handle %s",
377378
graph_index,
378379
str(compile_range),
379380
self.compiler.name,
380381
handle,
382+
scope="local",
381383
)
382384

383385
# after compiling the last graph, record the end time

vllm/config/scheduler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,9 +228,10 @@ def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
228228
self.encoder_cache_size = self.max_num_batched_tokens
229229

230230
if self.enable_chunked_prefill:
231-
logger.info(
231+
logger.info_once(
232232
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
233233
self.max_num_batched_tokens,
234+
scope="local",
234235
)
235236

236237
if self.max_num_partial_prefills > 1:

vllm/model_executor/layers/attention/mm_encoder_attention.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,9 @@ def __init__(
227227
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
228228
_get_flashinfer_workspace_buffer()
229229

230-
logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
230+
logger.info_once(
231+
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
232+
)
231233

232234
@classmethod
233235
def enabled(cls) -> bool:

vllm/model_executor/models/qwen3_next.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,14 +192,15 @@ def __init__(self) -> None:
192192
use_flashinfer = supports_flashinfer
193193

194194
if use_flashinfer:
195-
logger.info_once("Using FlashInfer GDN prefill kernel")
195+
logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
196196
logger.info_once(
197197
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
198198
"take a while to compile. Set `--gdn-prefill-backend triton` to "
199-
"avoid JIT compile time."
199+
"avoid JIT compile time.",
200+
scope="local",
200201
)
201202
else:
202-
logger.info_once("Using Triton/FLA GDN prefill kernel")
203+
logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")
203204

204205
self._forward_method = (
205206
self.forward_cuda if use_flashinfer else self.forward_native

vllm/platforms/cuda.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,8 @@ def get_vit_attn_backend(
387387
)
388388
if is_backend_supported:
389389
logger.info_once(
390-
f"Using backend {vit_attn_backend} for vit attention"
390+
f"Using backend {vit_attn_backend} for vit attention",
391+
scope="local",
391392
)
392393
return vit_attn_backend
393394
except ImportError:

vllm/v1/executor/multiproc_executor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -998,12 +998,13 @@ def set_multiprocessing_worker_envs():
998998
"OMP_NUM_THREADS" not in os.environ
999999
and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
10001000
):
1001-
logger.warning(
1001+
logger.warning_once(
10021002
"Reducing Torch parallelism from %d threads to %d to avoid "
10031003
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
10041004
"external environment to tune this value as needed.",
10051005
current_parallelism,
10061006
default_omp_num_threads,
1007+
scope="local",
10071008
)
10081009
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
10091010
torch.set_num_threads(default_omp_num_threads)

vllm/v1/worker/dp_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
2828
# this optimization if we run into this case.
2929
if parallel_config.disable_nccl_for_dp_synchronization:
3030
logger.info_once(
31-
"Using CPU all reduce to synchronize DP padding between ranks."
31+
"Using CPU all reduce to synchronize DP padding between ranks.",
32+
scope="local",
3233
)
3334
device = "cpu"
3435
group = get_dp_group().cpu_group

vllm/v1/worker/gpu_model_runner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5510,13 +5510,14 @@ def profile_run(self) -> None:
55105510
dummy_modality
55115511
]
55125512

5513-
logger.info(
5513+
logger.info_once(
55145514
"Encoder cache will be initialized with a "
55155515
"budget of %s tokens, and profiled with "
55165516
"%s %s items of the maximum feature size.",
55175517
encoder_budget,
55185518
max_mm_items_per_batch,
55195519
dummy_modality,
5520+
scope="local",
55205521
)
55215522

55225523
# Create dummy batch of multimodal inputs.

0 commit comments

Comments (0)