Skip to content

Commit 6bb08dc

Browse files
cg505claude
andcommitted
[Jobs] Always run jobs controllers with debug logging
The jobs controller process inherits the launching client's log level via SKYPILOT_DEBUG (through os.environ in consolidation mode, or the controller env file in remote mode). When the client does not opt into debug logging, the controller's own logs are emitted at INFO level and without timestamps. This makes it hard to reconstruct incidents from controller logs (e.g. determining when controllers re-claim jobs during failover). Always run the controller with debug logging enabled, independent of the client's log level, in both consolidation and remote modes: - At the controller process entry point (__main__), force SKYPILOT_DEBUG=1 and reload the logger so the controller-process ("controller_system") logs get debug detail + timestamps. Setting the env var also propagates to subprocesses spawned by the controller. - In the per-job loop, override SKYPILOT_DEBUG to '1' after applying the job's env (which carries the client's value), so per-job controller logs are also at debug level. Both entry points run the same `python -m sky.jobs.controller` module, so the fix covers consolidation-mode and remote controllers alike. Tested: - Repro'd in consolidation mode: controller_system log had no timestamps and no debug lines when the client did not set SKYPILOT_DEBUG. - After fix, verified end-to-end on Kubernetes in both consolidation mode and remote-controller mode: both the controller_system log and per-job logs are emitted at debug level with timestamps even when the launching client did not set SKYPILOT_DEBUG (env file still records SKYPILOT_DEBUG='0'). - Added unit tests for the per-job env override helper. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01FmmoQDXj9gwbXvS7PhXuLF
1 parent 719c50f commit 6bb08dc

2 files changed

Lines changed: 88 additions & 5 deletions

File tree

sky/jobs/controller.py

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from sky.utils import context_utils
5050
from sky.utils import controller_utils
5151
from sky.utils import dag_utils
52+
from sky.utils import env_options
5253
from sky.utils import status_lib
5354
from sky.utils import ux_utils
5455
from sky.utils.plugin_extensions import ExternalClusterFailure
@@ -103,6 +104,25 @@ def _get_dag(job_id: int) -> 'sky.Dag':
103104
return dag
104105

105106

107+
def _apply_job_env_overrides(
108+
ctx: context.SkyPilotContext,
109+
env_vars: typing.Mapping[str, Optional[str]]) -> None:
110+
"""Apply a job's env vars to ``ctx``, always forcing debug logging on.
111+
112+
The per-job env carries the launching client's SKYPILOT_DEBUG (e.g. '0').
113+
We override it to '1' after applying the env so the controller's per-job
114+
logs always include debug detail + timestamps, regardless of the client's
115+
log level. This mirrors the controller-process-level counterpart in the
116+
``__main__`` block. These logs are essential for reconstructing incidents
117+
and the extra verbosity on the controller is an acceptable cost.
118+
"""
119+
for key, value in env_vars.items():
120+
if value is not None:
121+
ctx.override_envs({key: value})
122+
logger.debug('Set environment variable: %s=%s', key, value)
123+
ctx.override_envs({env_options.Options.SHOW_DEBUG_INFO.env_key: '1'})
124+
125+
106126
def _add_k8s_annotations(task: 'sky.Task', job_id: int) -> None:
107127
"""Adds Kubernetes pod config annotations to the task resources.
108128
@@ -2148,11 +2168,7 @@ async def run_job_loop(self,
21482168
logger.info('Loading %d environment variables for job %s',
21492169
len(env_vars), job_id)
21502170
if ctx is not None:
2151-
for key, value in env_vars.items():
2152-
if value is not None:
2153-
ctx.override_envs({key: value})
2154-
logger.debug('Set environment variable: %s=%s', key,
2155-
value)
2171+
_apply_job_env_overrides(ctx, env_vars)
21562172

21572173
# Restore config file if needed
21582174
file_content_utils.restore_job_config_file(job_id)
@@ -2534,4 +2550,17 @@ async def main(controller_uuid: str):
25342550

25352551

25362552
if __name__ == '__main__':
2553+
# Always run the jobs controller with debug logging enabled, regardless of
2554+
# the launching client's log level. The controller process inherits the
2555+
# client's SKYPILOT_DEBUG (via os.environ in consolidation mode, or the
2556+
# controller env file in remote mode), which leaves the controller's own
2557+
# logs at INFO level and without timestamps when the client did not opt in.
2558+
# Controller debug logs (timestamps + extra detail) are essential for
2559+
# reconstructing incidents -- e.g. determining when controllers re-claim
2560+
# jobs during failover -- and the extra verbosity on the controller is an
2561+
# acceptable cost. We set the env var (so subprocesses spawned by the
2562+
# controller inherit it) and reload the logger so the level and timestamped
2563+
# formatter take effect immediately.
2564+
os.environ[env_options.Options.SHOW_DEBUG_INFO.env_key] = '1'
2565+
sky_logging.reload_logger()
25372566
asyncio.run(main(sys.argv[1]))

tests/unit_tests/test_sky/jobs/test_controller.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
and file mount cleanup in task_cleanup().
1010
"""
1111
import asyncio
12+
import os
1213
from typing import Dict, List, Optional, Tuple
1314
from unittest.mock import AsyncMock
1415
from unittest.mock import MagicMock
@@ -17,8 +18,11 @@
1718
import pytest
1819

1920
from sky.jobs import state as managed_job_state
21+
from sky.jobs.controller import _apply_job_env_overrides
2022
from sky.jobs.controller import ControllerManager
2123
from sky.utils import common
24+
from sky.utils import context
25+
from sky.utils import env_options
2226

2327

2428
class TestNormalJobRecovery:
@@ -1077,3 +1081,53 @@ def download_side_effect(task_id, handle, job_id_on_pool):
10771081
0, mock_handle_0, None)
10781082
controller.download_log_and_stream.assert_any_call(
10791083
1, mock_handle_1, None)
1084+
1085+
1086+
class TestApplyJobEnvOverrides:
1087+
"""Tests that the controller always forces debug logging for the per-job
1088+
loop, regardless of the launching client's log level.
1089+
1090+
The per-job env carries the client's SKYPILOT_DEBUG value (e.g. '0' when
1091+
the client did not opt into debug logging). The controller must override
1092+
this to '1' so the per-job controller logs always include debug detail and
1093+
timestamps -- essential for reconstructing incidents.
1094+
"""
1095+
1096+
@pytest.fixture
1097+
def ctx(self):
1098+
context.initialize()
1099+
yield context.get()
1100+
if context.get()._log_file_handle is not None:
1101+
context.get()._log_file_handle.close()
1102+
1103+
def test_forces_debug_when_client_disabled_it(self, ctx):
1104+
"""SKYPILOT_DEBUG='0' from the client is overridden to '1'."""
1105+
debug_key = env_options.Options.SHOW_DEBUG_INFO.env_key
1106+
_apply_job_env_overrides(ctx, {debug_key: '0', 'FOO': 'bar'})
1107+
# Other env vars are passed through unchanged.
1108+
assert ctx.env_overrides['FOO'] == 'bar'
1109+
# Debug is forced on regardless of the client's value.
1110+
assert ctx.env_overrides[debug_key] == '1'
1111+
1112+
def test_forces_debug_when_client_omitted_it(self, ctx):
1113+
"""Debug is forced on even when the env has no SKYPILOT_DEBUG."""
1114+
debug_key = env_options.Options.SHOW_DEBUG_INFO.env_key
1115+
_apply_job_env_overrides(ctx, {'FOO': 'bar'})
1116+
assert ctx.env_overrides[debug_key] == '1'
1117+
1118+
def test_skips_none_values(self, ctx):
1119+
"""None values (unset in the env file) are not applied."""
1120+
_apply_job_env_overrides(ctx, {'FOO': None})
1121+
assert 'FOO' not in ctx.env_overrides
1122+
1123+
def test_debug_visible_through_contextual_environ(self, ctx):
1124+
"""With the context active, SHOW_DEBUG_INFO.get() reflects the
1125+
forced value even though the client supplied '0'."""
1126+
debug_key = env_options.Options.SHOW_DEBUG_INFO.env_key
1127+
_apply_job_env_overrides(ctx, {debug_key: '0'})
1128+
original_environ = os.environ
1129+
try:
1130+
os.environ = context.ContextualEnviron(original_environ)
1131+
assert env_options.Options.SHOW_DEBUG_INFO.get() is True
1132+
finally:
1133+
os.environ = original_environ

0 commit comments

Comments
 (0)