Skip to content

Commit 0566ddb

Browse files
cg505claude
andcommitted
[Jobs] Always run jobs controllers with debug logging
The jobs controller process inherits SKYPILOT_DEBUG from whichever process starts it, and that inherited level is unreliable and rarely debug: - Consolidation mode: the controller subprocess inherits the os.environ of the process that starts it. In steady state that is the managed-job refresh daemon (a thread in the main API server process), so it inherits the API server's level; when a `sky jobs launch` request worker starts it directly, it inherits the client's level. - Remote mode: it comes from the controller env file (built from the launching client's level). None of these is guaranteed to be debug, so the controller's own ("controller_system") logs are emitted at INFO level and without timestamps unless someone happened to opt in. This makes it hard to reconstruct incidents from controller logs (e.g. determining when controllers re-claim jobs during a failover/rolling upgrade). Always run the controller with debug logging enabled, independent of the inherited level, in both consolidation and remote modes: - At the controller process entry point (__main__), force SKYPILOT_DEBUG=1 and reload the logger so the controller-process ("controller_system") logs get debug detail + timestamps. Setting the env var also propagates to subprocesses spawned by the controller. - In the per-job loop, override SKYPILOT_DEBUG to '1' after applying the job's env (which carries the launching client's value), so per-job controller logs are also at debug level. Both entry points run the same `python -m sky.jobs.controller` module, so the fix covers consolidation-mode and remote controllers alike. Tested: - Repro'd in consolidation mode: controller_system log had no timestamps and no debug lines when the client did not set SKYPILOT_DEBUG. - After fix, verified end-to-end on Kubernetes in both consolidation mode and remote-controller mode: both the controller_system log and per-job logs are emitted at debug level with timestamps even when the launching client did not set SKYPILOT_DEBUG (env file still records SKYPILOT_DEBUG='0'). - Added unit tests for the per-job env override helper. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01FmmoQDXj9gwbXvS7PhXuLF
1 parent 719c50f commit 0566ddb

2 files changed

Lines changed: 95 additions & 5 deletions

File tree

sky/jobs/controller.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from sky.utils import context_utils
5050
from sky.utils import controller_utils
5151
from sky.utils import dag_utils
52+
from sky.utils import env_options
5253
from sky.utils import status_lib
5354
from sky.utils import ux_utils
5455
from sky.utils.plugin_extensions import ExternalClusterFailure
@@ -103,6 +104,25 @@ def _get_dag(job_id: int) -> 'sky.Dag':
103104
return dag
104105

105106

107+
def _apply_job_env_overrides(
108+
ctx: context.SkyPilotContext,
109+
env_vars: typing.Mapping[str, Optional[str]]) -> None:
110+
"""Apply a job's env vars to ``ctx``, always forcing debug logging on.
111+
112+
The per-job env carries the launching client's SKYPILOT_DEBUG (e.g. '0').
113+
We override it to '1' after applying the env so the controller's per-job
114+
logs always include debug detail + timestamps, regardless of the client's
115+
log level. This mirrors the controller-process-level counterpart in the
116+
``__main__`` block. These logs are essential for reconstructing incidents
117+
and the extra verbosity on the controller is an acceptable cost.
118+
"""
119+
for key, value in env_vars.items():
120+
if value is not None:
121+
ctx.override_envs({key: value})
122+
logger.debug('Set environment variable: %s=%s', key, value)
123+
ctx.override_envs({env_options.Options.SHOW_DEBUG_INFO.env_key: '1'})
124+
125+
106126
def _add_k8s_annotations(task: 'sky.Task', job_id: int) -> None:
107127
"""Adds Kubernetes pod config annotations to the task resources.
108128
@@ -2148,11 +2168,7 @@ async def run_job_loop(self,
21482168
logger.info('Loading %d environment variables for job %s',
21492169
len(env_vars), job_id)
21502170
if ctx is not None:
2151-
for key, value in env_vars.items():
2152-
if value is not None:
2153-
ctx.override_envs({key: value})
2154-
logger.debug('Set environment variable: %s=%s', key,
2155-
value)
2171+
_apply_job_env_overrides(ctx, env_vars)
21562172

21572173
# Restore config file if needed
21582174
file_content_utils.restore_job_config_file(job_id)
@@ -2534,4 +2550,24 @@ async def main(controller_uuid: str):
25342550

25352551

25362552
if __name__ == '__main__':
2553+
# Always run the jobs controller with debug logging enabled, regardless of
2554+
# whatever SKYPILOT_DEBUG it would otherwise inherit. That inherited level
2555+
# is unreliable and rarely debug:
2556+
# - Consolidation mode: the controller subprocess inherits the os.environ
2557+
# of whichever process started it. In steady state that is the
2558+
# managed-job refresh daemon (a thread in the main API server process),
2559+
# so it inherits the *API server's* level; when a `sky jobs launch`
2560+
# request worker starts it directly, it inherits the *client's* level.
2561+
# - Remote mode: it comes from the controller env file (built from the
2562+
# launching client's level).
2563+
# None of these is guaranteed to be debug, so the controller's own
2564+
# ("controller_system") logs land at INFO and without timestamps unless
2565+
# someone happened to opt in. Controller debug logs (timestamps + extra
2566+
# detail) are essential for reconstructing incidents -- e.g. determining
2567+
# when controllers re-claim jobs during failover -- and the extra verbosity
2568+
# on the controller is an acceptable cost. We set the env var (so
2569+
# subprocesses spawned by the controller inherit it) and reload the logger
2570+
# so the level and timestamped formatter take effect immediately.
2571+
os.environ[env_options.Options.SHOW_DEBUG_INFO.env_key] = '1'
2572+
sky_logging.reload_logger()
25372573
asyncio.run(main(sys.argv[1]))

tests/unit_tests/test_sky/jobs/test_controller.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
and file mount cleanup in task_cleanup().
1010
"""
1111
import asyncio
12+
import os
1213
from typing import Dict, List, Optional, Tuple
1314
from unittest.mock import AsyncMock
1415
from unittest.mock import MagicMock
@@ -17,8 +18,11 @@
1718
import pytest
1819

1920
from sky.jobs import state as managed_job_state
21+
from sky.jobs.controller import _apply_job_env_overrides
2022
from sky.jobs.controller import ControllerManager
2123
from sky.utils import common
24+
from sky.utils import context
25+
from sky.utils import env_options
2226

2327

2428
class TestNormalJobRecovery:
@@ -1077,3 +1081,53 @@ def download_side_effect(task_id, handle, job_id_on_pool):
10771081
0, mock_handle_0, None)
10781082
controller.download_log_and_stream.assert_any_call(
10791083
1, mock_handle_1, None)
1084+
1085+
1086+
class TestApplyJobEnvOverrides:
1087+
"""Tests that the controller always forces debug logging for the per-job
1088+
loop, regardless of the launching client's log level.
1089+
1090+
The per-job env carries the client's SKYPILOT_DEBUG value (e.g. '0' when
1091+
the client did not opt into debug logging). The controller must override
1092+
this to '1' so the per-job controller logs always include debug detail and
1093+
timestamps -- essential for reconstructing incidents.
1094+
"""
1095+
1096+
@pytest.fixture
1097+
def ctx(self):
1098+
context.initialize()
1099+
yield context.get()
1100+
if context.get()._log_file_handle is not None:
1101+
context.get()._log_file_handle.close()
1102+
1103+
def test_forces_debug_when_client_disabled_it(self, ctx):
1104+
"""SKYPILOT_DEBUG='0' from the client is overridden to '1'."""
1105+
debug_key = env_options.Options.SHOW_DEBUG_INFO.env_key
1106+
_apply_job_env_overrides(ctx, {debug_key: '0', 'FOO': 'bar'})
1107+
# Other env vars are passed through unchanged.
1108+
assert ctx.env_overrides['FOO'] == 'bar'
1109+
# Debug is forced on regardless of the client's value.
1110+
assert ctx.env_overrides[debug_key] == '1'
1111+
1112+
def test_forces_debug_when_client_omitted_it(self, ctx):
1113+
"""Debug is forced on even when the env has no SKYPILOT_DEBUG."""
1114+
debug_key = env_options.Options.SHOW_DEBUG_INFO.env_key
1115+
_apply_job_env_overrides(ctx, {'FOO': 'bar'})
1116+
assert ctx.env_overrides[debug_key] == '1'
1117+
1118+
def test_skips_none_values(self, ctx):
1119+
"""None values (unset in the env file) are not applied."""
1120+
_apply_job_env_overrides(ctx, {'FOO': None})
1121+
assert 'FOO' not in ctx.env_overrides
1122+
1123+
def test_debug_visible_through_contextual_environ(self, ctx):
1124+
"""With the context active, SHOW_DEBUG_INFO.get() reflects the
1125+
forced value even though the client supplied '0'."""
1126+
debug_key = env_options.Options.SHOW_DEBUG_INFO.env_key
1127+
_apply_job_env_overrides(ctx, {debug_key: '0'})
1128+
original_environ = os.environ
1129+
try:
1130+
os.environ = context.ContextualEnviron(original_environ)
1131+
assert env_options.Options.SHOW_DEBUG_INFO.get() is True
1132+
finally:
1133+
os.environ = original_environ

0 commit comments

Comments
 (0)