Skip to content

Commit fc9dc5b

Browse files
rjpower and claude authored
[iris] Replace heartbeat-based logging with push-based LogService (#4274)
Add a LogService (PushLogs/QueryLogs RPCs) co-hosted on the controller. Workers push task and process logs directly via RPC instead of piggybacking on heartbeat responses. Removes worker-local LogStore, FetchLogs RPC from WorkerService, and all heartbeat log_entries plumbing. K8s provider pushes logs to the controller LogStore directly. Controller process logs keyed as /system/controller, worker process logs as /system/worker/<id>. Requires full cluster restart. --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 49d880c commit fc9dc5b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+1236
-675
lines changed

lib/iris/dashboard/src/components/shared/LogViewer.vue

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<script setup lang="ts">
22
import { ref, computed, onMounted, watch } from 'vue'
3-
import { useControllerRpc, useWorkerRpc } from '@/composables/useRpc'
3+
import { useLogServiceRpc } from '@/composables/useRpc'
44
import { useAutoRefresh } from '@/composables/useAutoRefresh'
55
import type { FetchLogsResponse, LogEntry, TaskAttempt } from '@/types/rpc'
66
import { timestampMs, logLevelClass, formatLogTime } from '@/utils/formatting'
@@ -34,8 +34,8 @@ function levelPriority(lvl: string | undefined): number {
3434
return LOG_LEVEL_PRIORITY[lvl.toLowerCase()] ?? 1
3535
}
3636
37-
// Choose the right RPC based on what we're viewing
38-
const useRpc = props.source === 'worker' ? useWorkerRpc : useControllerRpc
37+
// FetchLogs is served by the LogService (co-hosted on the controller)
38+
const useRpc = useLogServiceRpc
3939
4040
// Task IDs end with a numeric segment (e.g. /alice/job/0), job IDs don't.
4141
const isTask = props.taskId ? /\/\d+$/.test(props.taskId) : false
@@ -54,7 +54,7 @@ const taskLogState = props.taskId
5454
5555
const processLogState = !props.taskId
5656
? useRpc<FetchLogsResponse>('FetchLogs', () => ({
57-
source: props.workerId ? `/worker/${props.workerId}` : '/system/process',
57+
source: props.workerId ? `/system/worker/${props.workerId}` : '/system/controller',
5858
maxLines: tailLines.value || undefined,
5959
tail: true,
6060
}))

lib/iris/dashboard/src/composables/useRpc.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,26 @@ export async function controllerRpcCall<T>(method: string, body?: Record<string,
9292
return resp.json() as Promise<T>
9393
}
9494

95+
/** RPC composable for LogService endpoints. */
96+
export function useLogServiceRpc<T>(
97+
method: string,
98+
body?: RpcBody,
99+
): RpcState<T> {
100+
return useRpc<T>('iris.logging.LogService', method, body)
101+
}
102+
103+
/** One-shot RPC call for LogService. */
104+
export async function logServiceRpcCall<T>(method: string, body?: Record<string, unknown>): Promise<T> {
105+
const resp = await fetch(`/iris.logging.LogService/${method}`, {
106+
method: 'POST',
107+
headers: { 'Content-Type': 'application/json' },
108+
body: JSON.stringify(body ?? {}),
109+
})
110+
handleUnauthorized(resp)
111+
if (!resp.ok) throw new Error(`${method}: ${resp.status} ${resp.statusText}`)
112+
return resp.json() as Promise<T>
113+
}
114+
95115
export async function workerRpcCall<T>(method: string, body?: Record<string, unknown>): Promise<T> {
96116
const resp = await fetch(`/iris.cluster.WorkerService/${method}`, {
97117
method: 'POST',

lib/iris/src/iris/cli/bug_report.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@
1313

1414
from iris.cluster.log_store import build_log_source
1515
from iris.cluster.types import JobName
16-
from iris.rpc import cluster_pb2
16+
from iris.rpc import cluster_pb2, logging_pb2
1717
from iris.rpc.auth import AuthTokenInjector, TokenProvider
1818
from iris.rpc.cluster_connect import ControllerServiceClientSync
19+
from iris.rpc.logging_connect import LogServiceClientSync
1920
from iris.time_proto import timestamp_from_proto
2021

2122
logger = logging.getLogger(__name__)
@@ -105,14 +106,17 @@ def gather_bug_report(
105106
"""Gather all diagnostic data for a job into a BugReport."""
106107
interceptors = [AuthTokenInjector(token_provider)] if token_provider else []
107108
client = ControllerServiceClientSync(controller_url, timeout_ms=30000, interceptors=interceptors)
109+
log_client = LogServiceClientSync(controller_url, timeout_ms=30000, interceptors=interceptors)
108110
try:
109-
return _gather(client, job_id, tail=tail)
111+
return _gather(client, log_client, job_id, tail=tail)
110112
finally:
113+
log_client.close()
111114
client.close()
112115

113116

114117
def _gather(
115118
client: ControllerServiceClientSync,
119+
log_client: LogServiceClientSync,
116120
job_id: JobName,
117121
*,
118122
tail: int,
@@ -141,8 +145,8 @@ def _gather(
141145
try:
142146
# Fetch all attempts for this task, taking only the last `tail` lines.
143147
source = build_log_source(JobName.from_wire(task.task_id))
144-
log_resp = client.fetch_logs(
145-
cluster_pb2.FetchLogsRequest(
148+
log_resp = log_client.fetch_logs(
149+
logging_pb2.FetchLogsRequest(
146150
source=source,
147151
max_lines=tail,
148152
tail=True,

lib/iris/src/iris/cli/process_status.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@
1313
import humanfriendly
1414

1515
from iris.cli.main import require_controller_url, rpc_client
16-
from iris.rpc import cluster_pb2
16+
from iris.rpc import cluster_pb2, logging_pb2
17+
from iris.rpc.logging_connect import LogServiceClientSync
1718

18-
_CONTROLLER_TARGET = "/system/process"
19+
_CONTROLLER_LOG_TARGET = "/system/controller"
1920

2021

2122
def _print_status(resp: cluster_pb2.GetProcessStatusResponse, label: str) -> None:
@@ -81,36 +82,36 @@ def logs(ctx, target: str | None, level: str, follow: bool, max_lines: int, subs
8182
from datetime import datetime, timezone
8283

8384
url = require_controller_url(ctx)
84-
source = target or _CONTROLLER_TARGET
85-
86-
with rpc_client(url) as client:
87-
cursor = 0
88-
first = True
89-
while True:
90-
req = cluster_pb2.FetchLogsRequest(
91-
source=source,
92-
max_lines=max_lines if first else 100,
93-
tail=first,
94-
min_level=level,
95-
cursor=cursor if not first else 0,
96-
)
97-
if substring:
98-
req.substring = substring
85+
log_client = LogServiceClientSync(url)
86+
source = target or _CONTROLLER_LOG_TARGET
87+
88+
cursor = 0
89+
first = True
90+
while True:
91+
req = logging_pb2.FetchLogsRequest(
92+
source=source,
93+
max_lines=max_lines if first else 100,
94+
tail=first,
95+
min_level=level,
96+
cursor=cursor if not first else 0,
97+
)
98+
if substring:
99+
req.substring = substring
99100

100-
resp = client.fetch_logs(req)
101-
for entry in resp.entries:
102-
ts = ""
103-
if entry.timestamp and entry.timestamp.epoch_ms:
104-
dt = datetime.fromtimestamp(entry.timestamp.epoch_ms / 1000, tz=timezone.utc)
105-
ts = dt.strftime("%H:%M:%S")
106-
click.echo(f"[{ts}] {entry.data}")
101+
resp = log_client.fetch_logs(req)
102+
for entry in resp.entries:
103+
ts = ""
104+
if entry.timestamp and entry.timestamp.epoch_ms:
105+
dt = datetime.fromtimestamp(entry.timestamp.epoch_ms / 1000, tz=timezone.utc)
106+
ts = dt.strftime("%H:%M:%S")
107+
click.echo(f"[{ts}] {entry.data}")
107108

108-
cursor = resp.cursor
109-
first = False
109+
cursor = resp.cursor
110+
first = False
110111

111-
if not follow:
112-
break
113-
time.sleep(2)
112+
if not follow:
113+
break
114+
time.sleep(2)
114115

115116

116117
@process_group.command()
@@ -139,7 +140,7 @@ def profile(
139140
/system/worker/<id> for a worker, /alice/job/0 for a task container.
140141
"""
141142
url = require_controller_url(ctx)
142-
rpc_target = target or _CONTROLLER_TARGET
143+
rpc_target = target or ""
143144
label = target or "Controller"
144145

145146
if profiler == "threads":

lib/iris/src/iris/cluster/client/protocol.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from typing import Protocol
77

88
from iris.cluster.types import Entrypoint, JobName, TaskAttempt
9-
from iris.rpc import cluster_pb2
9+
from iris.rpc import cluster_pb2, logging_pb2
1010
from rigging.timing import Duration
1111

1212

@@ -91,7 +91,7 @@ def fetch_logs(
9191
substring: str = "",
9292
min_level: str = "",
9393
tail: bool = False,
94-
) -> cluster_pb2.FetchLogsResponse: ...
94+
) -> logging_pb2.FetchLogsResponse: ...
9595

9696
def get_autoscaler_status(self) -> cluster_pb2.Controller.GetAutoscalerStatusResponse: ...
9797

lib/iris/src/iris/cluster/client/remote_client.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@
1818
from iris.cluster.log_store._types import build_log_source
1919
from iris.cluster.runtime.entrypoint import build_runtime_entrypoint
2020
from iris.cluster.types import Entrypoint, EnvironmentSpec, JobName, TaskAttempt, adjust_tpu_replicas, is_job_finished
21-
from iris.rpc import cluster_pb2
21+
from iris.rpc import cluster_pb2, logging_pb2
2222
from iris.rpc.cluster_connect import ControllerServiceClientSync
23+
from iris.rpc.logging_connect import LogServiceClientSync
2324
from iris.rpc.errors import call_with_retry, format_connect_error, poll_with_retries
2425
from iris.time_proto import duration_to_proto
2526
from rigging.timing import Deadline, Duration, ExponentialBackoff
@@ -65,6 +66,11 @@ def __init__(
6566
timeout_ms=timeout_ms,
6667
interceptors=interceptors,
6768
)
69+
self._log_client = LogServiceClientSync(
70+
address=controller_address,
71+
timeout_ms=timeout_ms,
72+
interceptors=interceptors,
73+
)
6874

6975
def submit_job(
7076
self,
@@ -343,6 +349,7 @@ def _call():
343349

344350
def shutdown(self, wait: bool = True) -> None:
345351
del wait
352+
self._log_client.close()
346353
self._client.close()
347354

348355
def get_task_status(self, task_name: JobName) -> cluster_pb2.TaskStatus:
@@ -390,8 +397,8 @@ def fetch_logs(
390397
substring: str = "",
391398
min_level: str = "",
392399
tail: bool = False,
393-
) -> cluster_pb2.FetchLogsResponse:
394-
request = cluster_pb2.FetchLogsRequest(
400+
) -> logging_pb2.FetchLogsResponse:
401+
request = logging_pb2.FetchLogsRequest(
395402
source=source,
396403
since_ms=since_ms,
397404
cursor=cursor,
@@ -402,7 +409,7 @@ def fetch_logs(
402409
)
403410

404411
def _call():
405-
return self._client.fetch_logs(request)
412+
return self._log_client.fetch_logs(request)
406413

407414
return call_with_retry(f"fetch_logs({source})", _call)
408415

lib/iris/src/iris/cluster/controller/controller.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373
from iris.cluster.controller.dashboard import ControllerDashboard
7474
from iris.cluster.providers.k8s.tasks import K8sTaskProvider
7575
from iris.cluster.controller.provider import TaskProvider
76+
from iris.cluster.controller.worker_provider import WorkerProvider
7677
from iris.cluster.controller.scheduler import (
7778
JobRequirements,
7879
Scheduler,
@@ -94,7 +95,8 @@
9495
ReservationClaim,
9596
SchedulingEvent,
9697
)
97-
from iris.cluster.log_store import PROCESS_LOG_KEY, LogStore, LogStoreHandler
98+
from iris.cluster.log_store import CONTROLLER_LOG_KEY, LogStoreHandler
99+
from iris.log_server.server import LogServiceImpl
98100
from iris.cluster.types import (
99101
JobName,
100102
WorkerStatus,
@@ -105,7 +107,7 @@
105107
)
106108
from rigging.log_setup import slow_log
107109
from iris.managed_thread import ManagedThread, ThreadContainer, get_thread_container
108-
from iris.rpc import cluster_pb2
110+
from iris.rpc import cluster_pb2, logging_pb2
109111
from iris.rpc.auth import TokenVerifier
110112
from rigging.timing import Duration, ExponentialBackoff, RateLimiter, Timer, Timestamp, TokenBucket
111113

@@ -123,6 +125,21 @@
123125
_HISTORY_CLEANUP_INTERVAL_S = 60.0
124126

125127

128+
class _InProcessLogPusher:
129+
"""Adapts LogServiceImpl to the LogPusherProtocol for in-process use.
130+
131+
Avoids a network round-trip when the K8s provider is co-hosted with
132+
the controller: calls push_logs() directly on the service impl.
133+
"""
134+
135+
def __init__(self, log_service: LogServiceImpl) -> None:
136+
self._log_service = log_service
137+
138+
def push(self, key: str, entries: list[logging_pb2.LogEntry]) -> None:
139+
if entries:
140+
self._log_service.push_logs(logging_pb2.PushLogsRequest(key=key, entries=entries), ctx=None)
141+
142+
126143
class SchedulingOutcome(enum.Enum):
127144
"""Result of a scheduling cycle, used to drive adaptive backoff."""
128145

@@ -998,19 +1015,25 @@ def __init__(
9981015
self._db = db
9991016
else:
10001017
self._db = ControllerDB(db_dir=config.local_state_dir / "db")
1001-
self._log_store = LogStore(
1018+
1019+
self._log_service = LogServiceImpl(
10021020
log_dir=config.local_state_dir / "logs",
10031021
remote_log_dir=f"{config.remote_state_dir.rstrip('/')}/logs",
10041022
)
10051023

1006-
# Wire log store into the K8s provider so its LogCollector can write logs directly.
1007-
# Collectors are created lazily on first sync(), so just setting the field is enough.
1024+
# Wire an in-process log pusher into providers so log entries are
1025+
# forwarded through the LogService without a network hop.
1026+
# - K8sTaskProvider: its LogCollector pushes logs directly.
1027+
# - WorkerProvider: forwards log_entries piggybacked on heartbeat
1028+
# responses from old workers that predate push-based logging.
1029+
in_process_log_pusher = _InProcessLogPusher(self._log_service)
10081030
if isinstance(self._provider, K8sTaskProvider):
1009-
self._provider.log_store = self._log_store
1031+
self._provider.log_pusher = in_process_log_pusher
1032+
elif isinstance(self._provider, WorkerProvider):
1033+
self._provider.log_pusher = in_process_log_pusher
10101034

10111035
self._transitions = ControllerTransitions(
10121036
db=self._db,
1013-
log_store=self._log_store,
10141037
heartbeat_failure_threshold=config.heartbeat_failure_threshold,
10151038
user_budget_defaults=config.user_budget_defaults,
10161039
)
@@ -1023,20 +1046,23 @@ def __init__(
10231046
self._db,
10241047
controller=self,
10251048
bundle_store=self._bundle_store,
1026-
log_store=self._log_store,
1049+
log_service=self._log_service,
10271050
auth=config.auth,
1051+
system_endpoints={},
10281052
)
10291053
self._dashboard = ControllerDashboard(
10301054
self._service,
1055+
log_service=self._log_service,
10311056
host=config.host,
10321057
port=config.port,
10331058
auth_verifier=config.auth_verifier,
10341059
auth_provider=config.auth_provider,
10351060
auth_optional=config.auth.optional if config.auth else False,
10361061
)
10371062

1038-
# Ingest process logs into the LogStore so they are available via FetchLogs.
1039-
self._log_store_handler = LogStoreHandler(self._log_store, key=PROCESS_LOG_KEY)
1063+
# Ingest controller process logs into the LogStore via LogStoreHandler.
1064+
# This writes directly to the co-hosted LogStore (no RPC round-trip).
1065+
self._log_store_handler = LogStoreHandler(self._log_service.log_store, key=CONTROLLER_LOG_KEY)
10401066
self._log_store_handler.setLevel(logging.DEBUG)
10411067
self._log_store_handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(message)s"))
10421068
logging.getLogger("iris").addHandler(self._log_store_handler)
@@ -1144,6 +1170,12 @@ def start(self) -> None:
11441170
timeout=Duration.from_seconds(5.0),
11451171
)
11461172

1173+
# Register system endpoints. The address here is used for in-process
1174+
# and local-network callers (e.g. CLI, tests). Remote workers fall back
1175+
# to their configured controller_address when the resolved endpoint
1176+
# is unreachable, since the log service is co-hosted on the controller.
1177+
self._service._system_endpoints["/system/log-server"] = self.url
1178+
11471179
def stop(self) -> None:
11481180
"""Stop all background components gracefully.
11491181
@@ -1183,7 +1215,7 @@ def stop(self) -> None:
11831215
# sqlite3.ProgrammingError spam from late log records.
11841216
logging.getLogger("iris").removeHandler(self._log_store_handler)
11851217
self._log_store_handler.close()
1186-
self._log_store.close()
1218+
self._log_service.close()
11871219
self._db.close()
11881220
self._bundle_store.close()
11891221

0 commit comments

Comments (0)