Skip to content

Commit e56ecb9

Browse files
feat: improving heartbeat message (#348)
<!-- SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. --> <!-- SPDX-License-Identifier: Apache-2.0 --> <!-- Thank you for contributing to Safe Synthesizer! --> # Summary Improve the heartbeat message by adding an explanation that it is normal to have long stretches with no new records. ## Pre-Review Checklist <!-- These checks should be completed before a PR is reviewed, --> <!-- but you can submit a draft early to indicate that the issue is being worked on. --> Ensure that the following pass: - [x] `make format && make check` or via prek validation. - [x] `make test` passes locally - [x] `make test-e2e` passes locally - [ ] `make test-ci-container` passes locally (recommended) - [ ] GPU CI status check passes -- comment `/sync` on this PR to trigger a run (auto-triggers on ready-for-review) ## Pre-Merge Checklist <!-- These checks need to be completed before a PR is merged, --> <!-- but as PRs often change significantly during review, --> <!-- it's OK for them to be incomplete when review is first requested. --> - [ ] New or updated tests for any fix or new behavior - [ ] Updated documentation for new features and behaviors, including docstrings for API docs. ## Other Notes <!-- Please add the issue number that should be closed when this PR is merged. --> - Closes #<issue> --------- Signed-off-by: Alexa Haushalter <ahaushalter@nvidia.com>
1 parent 88a915d commit e56ecb9

4 files changed

Lines changed: 25 additions & 3 deletions

File tree

docs/user-guide/troubleshooting.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,8 +278,10 @@ to cause long generation times:
278278
`max_tokens` scales with context window: each generation prompt is allowed
279279
up to `max_seq_length` output tokens (`12,288` for SmolLM3). If the model
280280
produces long outputs before the stop condition fires, each prompt in the
281-
batch takes proportionally longer. A heartbeat log (`"Generation in
282-
progress"`) is emitted every 60 seconds to confirm the pipeline is alive.
281+
batch takes proportionally longer. Every 60 seconds a heartbeat line is
282+
logged starting with `Generation in progress`, plus a short note that new
283+
records only appear after a full batch of prompts finishes. Long
284+
stretches with no new records are normal while generation is still running.
283285

284286
Long-tail batch latency: vLLM processes all prompts in a batch
285287
simultaneously, but `llm.generate()` blocks until every prompt completes.

src/nemo_safe_synthesizer/generation/vllm_backend.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,7 @@ def generate(
532532
"Generation",
533533
logger_name=__name__,
534534
target_records=self.config.generation.num_records,
535+
progress_note=("Long stretches with no new records are normal."),
535536
):
536537
while batches.num_valid_records < self.config.generation.num_records:
537538
# Generate a batch from prompts and process the responses.

src/nemo_safe_synthesizer/observability.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -955,6 +955,7 @@ def heartbeat(
955955
interval: float = 60.0,
956956
*,
957957
logger_name: str | None = None,
958+
progress_note: str | None = None,
958959
**extra_fields,
959960
) -> Generator[None, None, None]:
960961
"""Context manager that logs a periodic heartbeat during a long-running operation.
@@ -964,6 +965,8 @@ def heartbeat(
964965
interval: Seconds between heartbeat log messages.
965966
logger_name: Logger name (pass ``__name__`` so heartbeat logs attribute
966967
to the calling module).
968+
progress_note: Optional sentence appended only to periodic ``... in progress``
969+
lines (so ``message`` can stay short for ``... complete`` / ``... failed``).
967970
**extra_fields: Additional structured fields passed to the logger
968971
(e.g. ``model="SmolLM3"``).
969972
"""
@@ -978,7 +981,10 @@ def _extra() -> dict:
978981

979982
def _run() -> None:
980983
while not stop.wait(timeout=interval):
981-
_logger.info(f"{message} in progress", extra={"ctx": _extra()})
984+
event = f"{message} in progress"
985+
if progress_note:
986+
event = f"{event}. {progress_note}"
987+
_logger.info(event, extra={"ctx": _extra()})
982988

983989
thread = threading.Thread(target=_run, daemon=True)
984990
thread.start()

tests/observability/test_observability.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,19 @@ def test_heartbeat_logs_progress_on_long_operation(self, caplog):
638638
assert "Slow op in progress" in caplog.text
639639
assert "Slow op complete" in caplog.text
640640

641+
def test_heartbeat_progress_note_on_periodic_logs_only(self, caplog):
642+
caplog.set_level(logging.INFO)
643+
message = "Generation"
644+
progress_note = "Records update only after each batch finishes."
645+
with heartbeat(message, interval=0.05, progress_note=progress_note):
646+
time.sleep(0.15)
647+
648+
assert f"{message} in progress. {progress_note}" in caplog.text
649+
assert f"{message} complete" in caplog.text
650+
for record in caplog.records:
651+
if "complete" in record.getMessage():
652+
assert progress_note not in record.getMessage()
653+
641654
def test_heartbeat_includes_extra_fields(self, caplog):
642655
caplog.set_level(logging.INFO)
643656
with heartbeat("Loading", interval=0.05, model="test-model"):

0 commit comments

Comments
 (0)