Commit 7a4a691

amritghimire, dreadatour, ampcode-com, and Copilot authored
job run: add --no-follow and fix behavior when websocket closes early (#1577)
* job run: add --no-follow and fix behavior when websocket closes early

  Add --no-follow so CI can wait for job completion without streaming logs to the console. When --no-follow is set we still consume the log stream and only skip printing log lines and log blobs.

  When the log stream websocket closes before a final status, we now fetch job status via REST and only show dataset versions if the job actually finished. Otherwise we print "Lost connection" and exit 1.

  Also fix the status check to use JobStatus.finished() and break on unknown status to avoid an infinite loop.

* Update docs/commands/job/run.md

  Co-authored-by: Vladimir Rudnykh <dreadatour@gmail.com>

* Skip ping messages

* Add no follow params to studio client

* Pass verbose flag through to job log streaming

  Switch the create_job call in process_jobs_args to keyword arguments for clarity and add the missing verbose parameter. show_logs_from_client now accepts a verbose flag and prints diagnostic messages when the job finishes, retries are exhausted, or an unknown status is encountered. This makes it easier to debug log streaming issues without attaching a debugger.

  Amp-Thread-ID: https://ampcode.com/threads/T-019c3357-1a5b-76bb-8ea5-9f3baf69cc99
  Co-authored-by: Amp <amp@ampcode.com>

* Fix tests

* Increase coverage

* studio: fix job run tests and switch verbose to logging

  - Fix test_studio_run_non_zero_exit_code and the websocket disconnect tests: patch StudioClient.tail_job_logs where it is used (datachain.studio) so the mock is applied. Add no_follow to the mock signature to match the real API. Mock GET jobs with a regex so requests with query params match.
  - In studio.py, drop the verbose flag from create_job and show_logs_from_client; use logger.debug() for debug messages instead.
  - Adjust test_studio_run_invalid_job_status to assert on caplog when checking debug messages. Add tests for verbose (caplog), log blobs, _get_job_status edge cases, rest_status None, dataset versions error, and TASK status.
  - Add a return type to _get_job_status and log on exception.

* Add clarify

* Fix test

* Update tests/test_cli_studio.py

  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Vladimir Rudnykh <dreadatour@gmail.com>
Co-authored-by: Amp <amp@ampcode.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
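The infinite-loop fix hinges on the check `JobStatus[latest_status] in JobStatus.finished()` breaking out on unknown statuses. A minimal sketch of that behavior follows; the enum members and the `finished()` helper here are stand-in assumptions, not DataChain's actual definitions.

```python
from enum import Enum

class JobStatus(Enum):
    # Hypothetical stand-in for DataChain's JobStatus; member names are assumed.
    CREATED = "created"
    RUNNING = "running"
    COMPLETE = "complete"
    FAILED = "failed"
    CANCELED = "canceled"

    @classmethod
    def finished(cls):
        # Terminal states: once a job is here, its status will not change again.
        return {cls.COMPLETE, cls.FAILED, cls.CANCELED}

def is_finished(status_name):
    # Mirrors the fix: unknown or missing statuses count as "not finished"
    # instead of raising and spinning in the retry loop forever.
    if not status_name:
        return False
    try:
        return JobStatus[status_name] in JobStatus.finished()
    except KeyError:
        return False

print(is_finished("COMPLETE"))  # True
print(is_finished("RUNNING"))   # False
print(is_finished("BOGUS"))     # False
```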
1 parent f3a7bf0 commit 7a4a691

File tree

5 files changed: +588 −45 lines


docs/commands/job/run.md

Lines changed: 8 additions & 1 deletion
````diff
@@ -14,7 +14,7 @@ usage: datachain job run [-h] [-v] [-q] [--team TEAM] [--env-file ENV_FILE]
                          [--req-file REQ_FILE] [--req REQ [REQ ...]]
                          [--priority PRIORITY]
                          [--start-time START_TIME] [--cron CRON]
-                         [--no-wait] [--ignore-checkpoints]
+                         [--no-wait] [--no-follow] [--ignore-checkpoints]
                          file
 ```
@@ -43,6 +43,7 @@ This command runs a job in Studio using the specified query file. You can config
 * `--start-time START_TIME` - Time to schedule the task in YYYY-MM-DDTHH:mm format or natural language.
 * `--cron CRON` - Cron expression for the cron task.
 * `--no-wait` - Do not wait for the job to finish.
+* `--no-follow` - Do not print the job logs to the console
 * `--ignore-checkpoints` - Ignore existing checkpoints and run from scratch.
 * `-h`, `--help` - Show the help message and exit.
 * `-v`, `--verbose` - Be verbose.
@@ -155,6 +156,12 @@ datachain job run --start-time "tomorrow 3pm" --cron "0 0 * * *" query.py
 datachain job run query.py --no-wait
 ```
 
+14. Start the job and wait for completion but don't print logs
+```bash
+# Useful for CI where you just want to wait for the completion of the jobs.
+datachain job run query.py --no-follow
+```
+
 ## Notes
 
 * **Checkpoints**: Running the same script multiple times via `datachain job run` automatically links jobs together, enabling checkpoint reuse. If a previous run of the same script (by absolute path) exists, DataChain will resume from where it left off.
````

src/datachain/cli/parser/job.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -122,6 +122,11 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         action="store_true",
         help="Do not wait for the job to finish",
     )
+    studio_run_parser.add_argument(
+        "--no-follow",
+        action="store_true",
+        help="Do not print the job logs to the console",
+    )
     studio_run_parser.add_argument(
         "--ignore-checkpoints",
         action="store_true",
```

src/datachain/remote/studio.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -297,7 +297,9 @@ def _unpacker_hook(code, data):
 
             return msgpack.ExtType(code, data)
 
-    async def tail_job_logs(self, job_id: str) -> AsyncIterator[dict]:
+    async def tail_job_logs(
+        self, job_id: str, no_follow: bool = False
+    ) -> AsyncIterator[dict]:
         """
         Follow job logs via websocket connection.
@@ -312,6 +314,8 @@ async def tail_job_logs(self, job_id: str) -> AsyncIterator[dict]:
             parsed_url._replace(scheme="wss" if parsed_url.scheme == "https" else "ws")
         )
         ws_url = f"{ws_url}/logs/follow/?job_id={job_id}&team_name={self.team}"
+        if no_follow:
+            ws_url += "&no_follow=true"
 
         async with websockets.connect(
             ws_url,
@@ -321,7 +325,8 @@ async def tail_job_logs(self, job_id: str) -> AsyncIterator[dict]:
             try:
                 message = await websocket.recv()
                 data = json.loads(message)
-
+                if data.get("type") == "ping":
+                    continue
                 # Yield the parsed message data
                 yield data
```
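The ping-skipping change above can be sketched with a fake frame source in place of the websocket; `fake_frames` and its payloads are hypothetical stand-ins for what `websocket.recv()` would deliver from the Studio log-follow endpoint.

```python
import asyncio
import json

async def fake_frames():
    # Hypothetical frames standing in for websocket.recv() messages.
    for frame in (
        '{"type": "ping"}',
        '{"logs": [{"message": "hello\\n"}]}',
        '{"type": "ping"}',
        '{"job": {"status": "COMPLETE"}}',
    ):
        yield frame

async def tail(frames):
    # Parse each frame and drop keepalive pings, as the patched
    # tail_job_logs now does before yielding to the caller.
    received = []
    async for raw in frames:
        data = json.loads(raw)
        if data.get("type") == "ping":
            continue
        received.append(data)
    return received

messages = asyncio.run(tail(fake_frames()))
print(len(messages))  # 2: both pings were skipped
```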
src/datachain/studio.py

Lines changed: 77 additions & 31 deletions
```diff
@@ -1,4 +1,5 @@
 import asyncio
+import logging
 import os
 import sys
 import warnings
@@ -20,6 +21,8 @@
 from datachain.remote.studio import StudioClient
 from datachain.utils import STUDIO_URL, flatten
 
+logger = logging.getLogger("datachain")
+
 if TYPE_CHECKING:
     from argparse import Namespace
 
@@ -43,23 +46,24 @@ def process_jobs_args(args: "Namespace"):
 
     if args.cmd == "run":
         return create_job(
-            args.file,
-            args.team,
-            args.env_file,
-            args.env,
-            args.workers,
-            args.files,
-            args.python_version,
-            args.repository,
-            args.req,
-            args.req_file,
-            args.priority,
-            args.cluster,
-            args.start_time,
-            args.cron,
-            args.no_wait,
-            args.credentials_name,
-            args.ignore_checkpoints,
+            query_file=args.file,
+            team_name=args.team,
+            env_file=args.env_file,
+            env=args.env,
+            workers=args.workers,
+            files=args.files,
+            python_version=args.python_version,
+            repository=args.repository,
+            req=args.req,
+            req_file=args.req_file,
+            priority=args.priority,
+            cluster=args.cluster,
+            start_time=args.start_time,
+            cron=args.cron,
+            no_wait=args.no_wait,
+            credentials_name=args.credentials_name,
+            ignore_checkpoints=args.ignore_checkpoints,
+            no_follow=args.no_follow,
         )
 
     if args.cmd == "cancel":
@@ -366,21 +370,33 @@ async def _show_log_blobs(log_blobs: list[str], client):
         print("\n>>>> Warning: Failed to fetch logs from studio")
 
 
-def show_logs_from_client(client, job_id):
+def _get_job_status(client, job_id: str) -> str | None:
+    try:
+        response = client.get_jobs(job_id=job_id)
+        if response.ok and response.data and len(response.data) > 0:
+            return response.data[0].get("status")
+    except (requests.RequestException, OSError, KeyError):
+        logger.debug("Failed to get job status: %s", job_id)
+    return None
+
+
+def show_logs_from_client(  # noqa: C901
+    client, job_id: str, no_follow: bool = False
+):
     async def _run():
         retry_count = 0
         latest_status = None
         processed_statuses = set()
         log_blobs_processed = False
         while True:
-            async for message in client.tail_job_logs(job_id):
-                if "log_blobs" in message:
+            async for message in client.tail_job_logs(job_id, no_follow=no_follow):
+                if "log_blobs" in message and not no_follow:
                     log_blobs = message.get("log_blobs", [])
                     if log_blobs and not log_blobs_processed:
                         log_blobs_processed = True
                         await _show_log_blobs(log_blobs, client)
 
-                elif "logs" in message:
+                elif "logs" in message and not no_follow:
                     for log in message["logs"]:
                         print(log["message"], end="")
                 elif "job" in message:
@@ -390,20 +406,41 @@ async def _run():
                     processed_statuses.add(latest_status)
                     print(f"\n>>>> Job is now in {latest_status} status.")
 
+            # After websocket closes, check actual job status via REST
+            rest_status = _get_job_status(client, job_id)
+            if rest_status and rest_status != latest_status:
+                print(f"\n>>>> Job is now in {rest_status} status.")
+            if rest_status:
+                latest_status = rest_status
+
             try:
-                if retry_count > RETRY_MAX_TIMES or (
-                    latest_status and JobStatus[latest_status].finished()
-                ):
+                if latest_status and JobStatus[latest_status] in JobStatus.finished():
+                    logger.debug("Job is in finished status: %s", latest_status)
+                    break
+                if retry_count > RETRY_MAX_TIMES:
+                    logger.debug("Max retry count reached: %s", retry_count)
                     break
                 await asyncio.sleep(RETRY_SLEEP_SEC)
                 retry_count += 1
             except KeyError:
-                pass
+                break
 
         return latest_status
 
     final_status = asyncio.run(_run())
 
+    try:
+        job_finished = final_status and JobStatus[final_status] in JobStatus.finished()
+    except KeyError:
+        logger.debug("Job status is not a valid status: %s", final_status)
+        job_finished = False
+
+    if not job_finished:
+        logger.debug("Job is not finished: %s.", final_status or "unknown")
+        print(f"\n>>>> Lost connection. Job status: {final_status or 'unknown'}.")
+        return 1
+
+    # Show dataset versions only for finished jobs
     response = client.dataset_job_versions(job_id)
     if not response.ok:
         raise DataChainError(response.message)
@@ -417,11 +454,13 @@ async def _run():
     else:
         print("\n\nNo dataset versions created during the job.")
 
-    exit_code_by_status = {
-        "FAILED": 1,
-        "CANCELED": 2,
-    }
-    return exit_code_by_status.get(final_status.upper(), 0) if final_status else 0
+    if final_status.upper() == "COMPLETE":
+        return 0
+    if final_status.upper() == "FAILED":
+        return 1
+    if final_status.upper() == "CANCELED":
+        return 2
+    return 0
@@ -442,6 +481,7 @@ def create_job(  # noqa: PLR0913
     no_wait: bool | None = False,
     credentials_name: str | None = None,
     ignore_checkpoints: bool = False,
+    no_follow: bool = False,
 ):
     catalog = get_catalog()
 
@@ -532,7 +572,13 @@ def create_job(  # noqa: PLR0913
     print("Open the job in Studio at", job_data.get("url"))
     print("=" * 40)
 
-    return 0 if no_wait else show_logs_from_client(client, job_id)
+    return (
+        0
+        if no_wait
+        else show_logs_from_client(
+            client=client, job_id=str(job_id), no_follow=no_follow
+        )
+    )
 
 
 def upload_files(client: StudioClient, files: list[str]) -> list[str]:
```
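Taken together, the post-disconnect behavior can be sketched as a single decision: prefer the REST status over the last websocket status, exit 1 if no terminal status was reached, otherwise map the terminal status to an exit code. `resolve_exit_code` and the status names below are illustrative stand-ins, not DataChain's actual API.

```python
FINISHED = {"COMPLETE", "FAILED", "CANCELED"}  # terminal statuses (assumed names)

def resolve_exit_code(ws_status, rest_status):
    # After the websocket closes, trust the REST status when available.
    final = rest_status or ws_status
    if final not in FINISHED:
        # Connection lost before a terminal status: fail the run,
        # matching the new "Lost connection" path that returns 1.
        return 1
    return {"COMPLETE": 0, "FAILED": 1, "CANCELED": 2}[final]

print(resolve_exit_code("RUNNING", "COMPLETE"))  # 0
print(resolve_exit_code("RUNNING", None))        # 1
print(resolve_exit_code(None, "CANCELED"))       # 2
```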
