diff --git a/lib/iris/examples/marin-dev.yaml b/lib/iris/examples/marin-dev.yaml index 1fa0bc8a85..31cfeb2759 100644 --- a/lib/iris/examples/marin-dev.yaml +++ b/lib/iris/examples/marin-dev.yaml @@ -30,6 +30,7 @@ worker_provider: {} controller: image: ghcr.io/marin-community/iris-controller:latest + use_split_heartbeat: true # try out split heartbeats on dev gcp: service_account: iris-controller@hai-gcp-models.iam.gserviceaccount.com zone: us-central1-a diff --git a/lib/iris/examples/marin.yaml b/lib/iris/examples/marin.yaml index a80dda4a6f..c61fdeab62 100644 --- a/lib/iris/examples/marin.yaml +++ b/lib/iris/examples/marin.yaml @@ -28,6 +28,7 @@ worker_provider: {} controller: image: ghcr.io/marin-community/iris-controller:latest + use_split_heartbeat: false # Stay on legacy heartbeat path until split mode burns in elsewhere. gcp: service_account: iris-controller@hai-gcp-models.iam.gserviceaccount.com zone: us-central1-a diff --git a/lib/iris/pyproject.toml b/lib/iris/pyproject.toml index cbe17fee04..2cf88fb68b 100644 --- a/lib/iris/pyproject.toml +++ b/lib/iris/pyproject.toml @@ -81,7 +81,8 @@ packages = ["src/iris"] [tool.pytest.ini_options] timeout = 10 -addopts = "-n auto --durations=25 -m 'not slow and not docker' -v" +timeout_method = "thread" +addopts = "-n auto --durations=25 -m 'not slow and not docker and not e2e' -v" markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", "docker: marks tests requiring Docker runtime (slow, needs daemon)", diff --git a/lib/iris/src/iris/cluster/controller/controller.py b/lib/iris/src/iris/cluster/controller/controller.py index f73378718a..a1ea0d56f1 100644 --- a/lib/iris/src/iris/cluster/controller/controller.py +++ b/lib/iris/src/iris/cluster/controller/controller.py @@ -6,6 +6,7 @@ import atexit import enum import logging +import queue as queue_mod import sys import tempfile import threading @@ -99,8 +100,10 @@ ControllerTransitions, DIRECT_PROVIDER_PROMOTION_RATE, HeartbeatAction, + 
def _drain_queue(q: queue_mod.Queue, timeout: float = 1.0) -> list:
    """Collect everything that is currently queued on ``q``.

    Waits at most ``timeout`` seconds for a first item to arrive. Once one
    has been received, the rest of the backlog is taken without blocking.

    Returns:
        The drained items in queue order, or an empty list when nothing
        arrived before the timeout expired.
    """
    try:
        first = q.get(timeout=timeout)
    except queue_mod.Empty:
        # Nothing showed up within the wait window.
        return []

    drained = [first]
    while True:
        try:
            drained.append(q.get_nowait())
        except queue_mod.Empty:
            # Backlog exhausted; hand back what was collected.
            return drained
+ Set False to fall back to the legacy monolithic Heartbeat path.""" + user_budget_defaults: UserBudgetDefaults = field(default_factory=UserBudgetDefaults) """Default budget settings applied when a new user is first seen.""" @@ -1105,6 +1127,10 @@ def __init__( self._autoscaler_thread: ManagedThread | None = None self._profile_thread: ManagedThread | None = None self._prune_thread: ManagedThread | None = None + self._task_updater_thread: ManagedThread | None = None + self._ping_thread: ManagedThread | None = None + self._poll_thread: ManagedThread | None = None + self._task_update_queue: queue_mod.Queue[HeartbeatApplyRequest] = queue_mod.Queue() self._autoscaler: Autoscaler | None = autoscaler @@ -1194,6 +1220,14 @@ def start(self) -> None: if isinstance(self._provider, K8sTaskProvider): self._heartbeat_thread = self._threads.spawn(self._run_direct_provider_loop, name="provider-loop") + elif self._config.use_split_heartbeat: + self._scheduling_thread = self._threads.spawn(self._run_scheduling_loop, name="scheduling-loop") + self._ping_thread = self._threads.spawn(self._run_ping_loop, name="ping-loop") + self._task_updater_thread = self._threads.spawn(self._run_task_updater_loop, name="task-updater-loop") + self._poll_thread = self._threads.spawn(self._run_poll_loop, name="poll-loop") + if not self._config.dry_run: + self._profile_thread = self._threads.spawn(self._run_profile_loop, name="profile-loop") + self._prune_thread = self._threads.spawn(self._run_prune_loop, name="prune-loop") else: self._scheduling_thread = self._threads.spawn(self._run_scheduling_loop, name="scheduling-loop") self._heartbeat_thread = self._threads.spawn(self._run_provider_loop, name="provider-loop") @@ -1261,6 +1295,15 @@ def stop(self) -> None: if self._heartbeat_thread: self._heartbeat_thread.stop() self._heartbeat_thread.join(timeout=join_timeout) + if self._ping_thread: + self._ping_thread.stop() + self._ping_thread.join(timeout=join_timeout) + if self._task_updater_thread: + 
self._task_updater_thread.stop() + self._task_updater_thread.join(timeout=join_timeout) + if self._poll_thread: + self._poll_thread.stop() + self._poll_thread.join(timeout=join_timeout) if self._prune_thread: self._prune_thread.stop() self._prune_thread.join(timeout=join_timeout) @@ -1908,8 +1951,12 @@ def _run_scheduler_pass( len(result.assignments), ) if all_assignments: - with slow_log(logger, "buffer_assignments", threshold_ms=200): - self._buffer_assignments(all_assignments) + if self._config.use_split_heartbeat: + with slow_log(logger, "dispatch_assignments_direct", threshold_ms=200): + self._dispatch_assignments_direct(all_assignments) + else: + with slow_log(logger, "buffer_assignments", threshold_ms=200): + self._buffer_assignments(all_assignments) logger.debug( "Scheduling cycle: %d assignments (%d preferred, %d normal), %dms (state read: %dms)", len(all_assignments), @@ -2066,16 +2113,17 @@ def kill_tasks_on_workers( task_ids: set[JobName], task_kill_workers: dict[JobName, WorkerId] | None = None, ) -> None: - """Buffer kill requests for delivery via next heartbeat. + """Kill tasks on their assigned workers. - Called after state has marked tasks as killed. For each task that had - a worker assigned, buffers the kill request for delivery via the next - heartbeat to that worker. Tasks without a worker assignment are routed - to the direct kill queue when a K8sTaskProvider is configured. + In split heartbeat mode, sends StopTasks RPCs directly. Otherwise, + buffers kill requests for delivery via next heartbeat. 
""" if self._config.dry_run: logger.info("[DRY-RUN] Would kill %d tasks on workers: %s", len(task_ids), list(task_ids)[:5]) return + if self._config.use_split_heartbeat and not isinstance(self._provider, K8sTaskProvider): + self._stop_tasks_direct(task_ids, task_kill_workers) + return any_buffered = False mapping = dict(task_kill_workers or {}) unresolved = task_ids - set(mapping.keys()) @@ -2100,6 +2148,232 @@ def kill_tasks_on_workers( if any_buffered: self._heartbeat_event.set() + # ========================================================================= + # Split Heartbeat Mode + # ========================================================================= + + def _dispatch_assignments_direct( + self, + assignments: list[tuple[JobName, WorkerId]], + ) -> None: + """Commit assignments and send StartTasks RPCs directly.""" + if self._config.dry_run: + for task_id, worker_id in assignments: + logger.info("[DRY-RUN] Would assign task %s to worker %s", task_id, worker_id) + return + command = [Assignment(task_id=task_id, worker_id=worker_id) for task_id, worker_id in assignments] + result = self._transitions.queue_assignments(command, direct_dispatch=True) + + # Group StartTasks payloads by (worker_id, address) + by_worker: dict[tuple[WorkerId, str], list[job_pb2.RunTaskRequest]] = {} + for worker_id, address, run_request in result.start_requests: + by_worker.setdefault((worker_id, address), []).append(run_request) + + attempt_by_worker_task = { + (worker_id, t.task_id): t.attempt_id for (worker_id, _), tasks in by_worker.items() for t in tasks + } + jobs = [(worker_id, address, tasks) for (worker_id, address), tasks in by_worker.items()] + for worker_id, response, error in self._provider.start_tasks(jobs): + if error is not None: + logger.warning("StartTasks RPC failed for worker %s: %s", worker_id, error) + continue + assert response is not None + for ack in response.acks: + if not ack.accepted: + logger.warning("Worker %s rejected task %s: %s", worker_id, 
def _stop_tasks_direct(
    self,
    task_ids: set[JobName],
    task_kill_workers: dict[JobName, WorkerId] | None = None,
) -> None:
    """Issue StopTasks RPCs straight to the workers that own the given tasks.

    Args:
        task_ids: Tasks that should be stopped.
        task_kill_workers: Optional task -> worker mapping supplied by the
            caller. Tasks from ``task_ids`` that are absent from it are
            looked up via ``_task_worker_mapping``.

    Tasks whose worker is not returned by ``_workers_by_id`` are skipped.
    RPC failures are logged and otherwise ignored.
    """
    task_to_worker = dict(task_kill_workers) if task_kill_workers else {}
    missing = task_ids.difference(task_to_worker)
    if missing:
        task_to_worker.update(_task_worker_mapping(self._db, missing))
    known_workers = _workers_by_id(self._db, {*task_to_worker.values()})

    # One StopTasks request per (worker, address) pair, carrying wire-format task ids.
    wire_ids_by_target: dict[tuple[WorkerId, str], list[str]] = {}
    for task, owner in task_to_worker.items():
        record = known_workers.get(owner)
        if record is not None:
            target = (owner, record.address)
            wire_ids_by_target.setdefault(target, []).append(task.to_wire())

    batches = [(owner, address, wire_ids) for (owner, address), wire_ids in wire_ids_by_target.items()]
    for failed_worker, failure in self._provider.stop_tasks(batches):
        if failure is None:
            continue
        logger.warning("StopTasks RPC failed for worker %s: %s", failed_worker, failure)
This is safe: fail_workers_batch, on_worker_failed, and + terminate_slices_for_workers are all idempotent. + """ + ping_interval_s = self._config.heartbeat_interval.to_seconds() + limiter = RateLimiter(interval_seconds=ping_interval_s) + ping_failures: dict[str, int] = {} + threshold = self._config.heartbeat_failure_threshold + # Refresh resource snapshots every ~60s; other cycles just note liveness. + resource_update_every = max(1, round(60.0 / ping_interval_s)) + cycle = 0 + + while not stop_event.is_set(): + if not limiter.wait(cancel=stop_event): + break + if self._checkpoint_paused.is_set(): + continue + try: + self._reap_stale_workers() + workers = self._get_active_worker_addresses() + results = self._provider.ping_workers(workers) + update_resources = cycle % resource_update_every == 0 + cycle += 1 + + dead_workers: list[str] = [] + liveness_ids: list[WorkerId] = [] + for result in results: + wid_str = str(result.worker_id) + if result.error is not None: + ping_failures[wid_str] = ping_failures.get(wid_str, 0) + 1 + if ping_failures[wid_str] >= threshold: + dead_workers.append(wid_str) + logger.warning( + "Ping loop: worker %s exceeded failure threshold (%d)", + wid_str, + ping_failures[wid_str], + ) + else: + ping_failures.pop(wid_str, None) + if update_resources and result.resource_snapshot: + self._transitions.update_worker_ping_success(result.worker_id, result.resource_snapshot) + else: + liveness_ids.append(result.worker_id) + + if liveness_ids: + self._transitions.touch_worker_liveness(liveness_ids) + + if dead_workers: + failure_result = self._transitions.fail_workers_batch( + dead_workers, reason="ping failure threshold exceeded" + ) + for wid, addr in failure_result.removed_workers: + ping_failures.pop(str(wid), None) + self._provider.on_worker_failed(wid, addr) + + if self._autoscaler and failure_result.removed_workers: + actually_removed = [str(wid) for wid, _ in failure_result.removed_workers] + sibling_ids = 
def _run_poll_loop(self, stop_event: threading.Event) -> None:
    """Drive periodic full-state reconciliation in split heartbeat mode.

    Each pass is paced by a ``RateLimiter`` configured with a 60 second
    interval and calls ``_poll_all_workers``. Passes are skipped while
    ``_checkpoint_paused`` is set. The loop ends when ``stop_event`` is set
    or the limiter's wait returns a falsy value. A failing pass is logged
    and does not end the loop.
    """
    pacing = RateLimiter(interval_seconds=60.0)
    while True:
        if stop_event.is_set() or not pacing.wait(cancel=stop_event):
            return
        if not self._checkpoint_paused.is_set():
            try:
                self._poll_all_workers()
            except Exception:
                logger.exception("Poll loop iteration failed")


def _poll_all_workers(self) -> None:
    """Ask every worker for task state and queue the answers for the updater.

    Does nothing in dry-run mode or when no running tasks are returned by
    ``get_running_tasks_for_poll``. Workers whose poll failed are logged and
    skipped. Workers that returned no updates enqueue nothing.
    """
    if self._config.dry_run:
        return

    running_by_worker, address_by_worker = self._transitions.get_running_tasks_for_poll()
    if not running_by_worker:
        return

    for polled_worker, task_updates, failure in self._provider.poll_workers(running_by_worker, address_by_worker):
        if failure is not None:
            logger.warning("PollTasks failed for worker %s: %s", polled_worker, failure)
            continue
        if not task_updates:
            continue
        pending = HeartbeatApplyRequest(
            worker_id=polled_worker,
            worker_resource_snapshot=None,
            updates=task_updates,
        )
        self._task_update_queue.put(pending)
+ """ + while not stop_event.is_set(): + if self._checkpoint_paused.is_set(): + stop_event.wait(1.0) + continue + requests = _drain_queue(self._task_update_queue, timeout=1.0) + if not requests or stop_event.is_set(): + continue + try: + results = self._transitions.apply_heartbeats_batch(requests) + all_tasks_to_kill: set[JobName] = set() + all_task_kill_workers: dict[JobName, WorkerId] = {} + for result in results: + all_tasks_to_kill.update(result.tasks_to_kill) + all_task_kill_workers.update(result.task_kill_workers) + if all_tasks_to_kill: + self._stop_tasks_direct(all_tasks_to_kill, all_task_kill_workers) + except Exception: + logger.exception("Task updater loop iteration failed") + def _reap_stale_workers(self) -> None: """Fail workers whose last heartbeat exceeds the staleness threshold. diff --git a/lib/iris/src/iris/cluster/controller/main.py b/lib/iris/src/iris/cluster/controller/main.py index fd107a37e5..80a7ef8531 100644 --- a/lib/iris/src/iris/cluster/controller/main.py +++ b/lib/iris/src/iris/cluster/controller/main.py @@ -128,6 +128,11 @@ def run_controller_serve( logger.info("Controller local state dir: %s (dry_run=%s)", local_state_dir, dry_run) heartbeat_failure_threshold = cluster_config.controller.heartbeat_failure_threshold or HEARTBEAT_FAILURE_THRESHOLD + use_split_heartbeat = ( + cluster_config.controller.use_split_heartbeat + if cluster_config.controller.HasField("use_split_heartbeat") + else True + ) # --- Restore or reuse local DB --- local_state_dir.mkdir(parents=True, exist_ok=True) @@ -247,6 +252,7 @@ def run_controller_serve( auth=auth, dry_run=dry_run, log_service_address=log_service_address, + use_split_heartbeat=use_split_heartbeat, ) controller = Controller( diff --git a/lib/iris/src/iris/cluster/controller/service.py b/lib/iris/src/iris/cluster/controller/service.py index 25de060e24..81b08c7ab2 100644 --- a/lib/iris/src/iris/cluster/controller/service.py +++ b/lib/iris/src/iris/cluster/controller/service.py @@ -90,7 +90,12 @@ from 
iris.cluster.controller.query import execute_raw_query from iris.rpc import query_pb2 from iris.cluster.controller.scheduler import SchedulingContext -from iris.cluster.controller.transitions import ControllerTransitions, TASK_RESOURCE_HISTORY_RETENTION +from iris.cluster.controller.transitions import ( + TASK_RESOURCE_HISTORY_RETENTION, + ControllerTransitions, + HeartbeatApplyRequest, + task_updates_from_proto, +) from iris.cluster.controller.provider import ProviderError from iris.cluster.log_store import build_log_source, worker_log_key from iris.cluster.process_status import get_process_status @@ -2601,3 +2606,30 @@ def get_scheduler_state( total_pending=total_pending, total_running=len(running_protos), ) + + # --- Worker Push --- + + def update_task_status( + self, + request: controller_pb2.Controller.UpdateTaskStatusRequest, + _ctx: Any, + ) -> controller_pb2.Controller.UpdateTaskStatusResponse: + """Worker pushes task state transitions to controller. + + Converts the proto updates into TaskUpdate dataclasses and applies + them through the same ControllerTransitions.apply_heartbeat() path + used by the poll-based heartbeat. Stop decisions are delivered via + the StopTasks RPC, not piggy-backed on the response. + """ + updates = task_updates_from_proto(request.updates) + if updates: + self._transitions.apply_heartbeat( + HeartbeatApplyRequest( + worker_id=WorkerId(request.worker_id), + worker_resource_snapshot=None, + updates=updates, + ) + ) + # Wake the controller so it can act on any state changes promptly. 
def task_updates_from_proto(entries) -> list[TaskUpdate]:
    """Translate worker-reported task status protos into ``TaskUpdate`` records.

    Entries whose state is UNSPECIFIED or PENDING are dropped. Every other
    entry is converted in input order.

    Empty ``error`` / ``container_id`` strings, an unset ``exit_code`` and a
    zero-size ``resource_usage`` message are all normalized to ``None``.
    """

    def _to_update(status) -> TaskUpdate:
        usage = status.resource_usage
        return TaskUpdate(
            task_id=JobName.from_wire(status.task_id),
            attempt_id=status.attempt_id,
            new_state=status.state,
            error=status.error if status.error else None,
            exit_code=status.exit_code if status.HasField("exit_code") else None,
            resource_usage=usage if usage.ByteSize() > 0 else None,
            container_id=status.container_id if status.container_id else None,
        )

    ignored_states = (job_pb2.TASK_STATE_UNSPECIFIED, job_pb2.TASK_STATE_PENDING)
    return [_to_update(status) for status in entries if status.state not in ignored_states]
+ """ accepted: list[Assignment] = [] rejected: list[Assignment] = [] + start_requests: list[tuple[WorkerId, str, job_pb2.RunTaskRequest]] = [] has_real_dispatch = False with self._db.transaction() as cur: now_ms = Timestamp.now().epoch_ms() @@ -1640,7 +1671,10 @@ def queue_assignments(self, assignments: list[Assignment]) -> AssignmentResult: constraints=[c.to_proto() for c in constraints_from_json(job.constraints_json)], task_image=job.task_image, ) - enqueue_run_dispatch(cur, str(assignment.worker_id), run_request.SerializeToString(), now_ms) + if direct_dispatch: + start_requests.append((assignment.worker_id, str(worker_row["address"]), run_request)) + else: + enqueue_run_dispatch(cur, str(assignment.worker_id), run_request.SerializeToString(), now_ms) has_real_dispatch = True cur.execute( "INSERT INTO worker_task_history(worker_id, task_id, assigned_at_ms) VALUES (?, ?, ?)", @@ -1658,7 +1692,11 @@ def queue_assignments(self, assignments: list[Assignment]) -> AssignmentResult: actions = [("assignment_queued", a.task_id.to_wire(), {"worker_id": str(a.worker_id)}) for a in accepted] self._record_transaction(cur, "queue_assignments", actions) return AssignmentResult( - tasks_to_kill=set(), has_real_dispatch=has_real_dispatch, accepted=accepted, rejected=rejected + tasks_to_kill=set(), + has_real_dispatch=has_real_dispatch, + accepted=accepted, + rejected=rejected, + start_requests=start_requests, ) def _update_worker_health(self, cur: TransactionCursor, req: HeartbeatApplyRequest, now_ms: int) -> bool: @@ -3000,6 +3038,78 @@ def _stopped() -> bool: return result + # ========================================================================= + # Split Heartbeat Helpers + # ========================================================================= + + def touch_worker_liveness(self, worker_ids: Sequence[WorkerId]) -> None: + """Cheap liveness bump: update last_heartbeat_ms without rewriting resources.""" + if not worker_ids: + return + now_ms = 
def get_running_tasks_for_poll(
    self,
) -> tuple[dict[WorkerId, list[RunningTaskEntry]], dict[WorkerId, str]]:
    """Snapshot running tasks and worker addresses for PollTasks RPCs.

    Both queries run against a single read snapshot, so the task list and
    the worker list describe the same database state.

    Returns:
        ``(running_by_worker, worker_addresses)``. ``worker_addresses`` maps
        every active, healthy worker to its RPC address. ``running_by_worker``
        maps a worker to the tasks currently assigned to it whose state is in
        ``ACTIVE_TASK_STATES``, ordered by task id. Workers with no such task
        do not appear in it. Both dicts are empty when there are no active,
        healthy workers.
    """
    with self._db.read_snapshot() as snap:
        worker_rows = snap.fetchall("SELECT worker_id, address FROM workers WHERE active = 1 AND healthy = 1")
        worker_addresses: dict[WorkerId, str] = {}
        worker_ids: list[str] = []
        for row in worker_rows:
            raw_id = str(row["worker_id"])
            worker_addresses[WorkerId(raw_id)] = str(row["address"])
            worker_ids.append(raw_id)

        if not worker_ids:
            return {}, {}

        # Size the state placeholder list from the collection itself, so the
        # SQL and the bound parameters cannot drift apart if
        # ACTIVE_TASK_STATES gains or loses a member.
        active_states = tuple(ACTIVE_TASK_STATES)
        worker_placeholders = ",".join("?" for _ in worker_ids)
        state_placeholders = ", ".join("?" for _ in active_states)
        # NOTE(review): one bound parameter per worker is used here; very
        # large fleets may hit the database's bound-parameter limit. Confirm
        # against expected cluster sizes.
        task_rows = snap.fetchall(
            f"SELECT t.task_id, t.current_attempt_id, t.current_worker_id "
            f"FROM tasks t "
            f"WHERE t.current_worker_id IN ({worker_placeholders}) AND t.state IN ({state_placeholders}) "
            f"ORDER BY t.task_id ASC",
            (*worker_ids, *active_states),
        )

    running: dict[WorkerId, list[RunningTaskEntry]] = {}
    for row in task_rows:
        wid = WorkerId(str(row["current_worker_id"]))
        entry = RunningTaskEntry(
            task_id=JobName.from_wire(str(row["task_id"])),
            attempt_id=int(row["current_attempt_id"]),
        )
        running.setdefault(wid, []).append(entry)
    return running, worker_addresses
def ping_workers(self, workers: list[tuple[WorkerId, str | None]]) -> list[PingResult]:
    """Send Ping RPCs to all workers concurrently and return per-worker results.

    Args:
        workers: ``(worker_id, address)`` pairs. A missing or empty address
            yields an error result without any RPC being attempted.

    Returns:
        One ``PingResult`` per input pair, in input order. ``error`` is set
        when the worker has no address, the RPC raised, or the worker
        reported itself unhealthy. In the unhealthy case ``healthy`` is
        False and ``health_error`` carries the worker's own message, so the
        result does not contradict its error. An empty input returns an
        empty list.

    At most ``self.parallelism`` pings are in flight at once. The method
    never raises for an individual worker; failures are reported in the
    corresponding result.
    """
    if not workers:
        return []

    async def _one(sem: asyncio.Semaphore, wid: WorkerId, addr: str | None) -> PingResult:
        async with sem:
            if not addr:
                return PingResult(worker_id=wid, worker_address=addr, error=f"Worker {wid} has no address")
            try:
                stub = self.stub_factory.get_stub(addr)
                response = await stub.ping(worker_pb2.Worker.PingRequest())
                if not response.healthy:
                    # Carry the self-reported health through instead of
                    # leaving the dataclass defaults (healthy=True, "").
                    return PingResult(
                        worker_id=wid,
                        worker_address=addr,
                        healthy=False,
                        health_error=response.health_error,
                        error=f"worker {wid} reported unhealthy: {response.health_error}",
                    )
                return PingResult(
                    worker_id=wid,
                    worker_address=addr,
                    resource_snapshot=(
                        response.resource_snapshot if response.resource_snapshot.ByteSize() > 0 else None
                    ),
                    healthy=response.healthy,
                    health_error=response.health_error,
                )
            except Exception as e:
                # Some exceptions stringify to "", which would make the
                # failure unreadable in logs; fall back to the type name.
                return PingResult(worker_id=wid, worker_address=addr, error=str(e) or type(e).__name__)

    async def _run() -> list[PingResult]:
        sem = asyncio.Semaphore(self.parallelism)
        return await asyncio.gather(*(_one(sem, wid, addr) for wid, addr in workers))

    return asyncio.run(_run())
concurrently. + + Returns a list of (worker_id, updates_or_none, error_or_none). + """ + if not running: + return [] + + async def _one( + sem: asyncio.Semaphore, wid: WorkerId, entries: list[RunningTaskEntry], addr: str | None + ) -> tuple[WorkerId, list[TaskUpdate] | None, str | None]: + async with sem: + if not addr: + return (wid, None, f"Worker {wid} has no address") + try: + expected = [ + job_pb2.WorkerTaskStatus(task_id=e.task_id.to_wire(), attempt_id=e.attempt_id) for e in entries + ] + stub = self.stub_factory.get_stub(addr) + response = await stub.poll_tasks(worker_pb2.Worker.PollTasksRequest(expected_tasks=expected)) + return (wid, task_updates_from_proto(response.tasks), None) + except Exception as e: + return (wid, None, str(e)) + + async def _run() -> list[tuple[WorkerId, list[TaskUpdate] | None, str | None]]: + sem = asyncio.Semaphore(self.parallelism) + return await asyncio.gather(*(_one(sem, wid, running[wid], worker_addresses.get(wid)) for wid in running)) + + return asyncio.run(_run()) + def close(self) -> None: self.stub_factory.close() diff --git a/lib/iris/src/iris/cluster/worker/service.py b/lib/iris/src/iris/cluster/worker/service.py index 3b1c63d517..566626072f 100644 --- a/lib/iris/src/iris/cluster/worker/service.py +++ b/lib/iris/src/iris/cluster/worker/service.py @@ -35,6 +35,12 @@ def get_task(self, task_id: str, attempt_id: int = -1) -> TaskInfo | None: ... def list_tasks(self) -> list[TaskInfo]: ... def kill_task(self, task_id: str, term_timeout_ms: int = 5000) -> bool: ... def handle_heartbeat(self, request: job_pb2.HeartbeatRequest) -> job_pb2.HeartbeatResponse: ... + def handle_ping(self, request: worker_pb2.Worker.PingRequest) -> worker_pb2.Worker.PingResponse: ... + def handle_start_tasks( + self, request: worker_pb2.Worker.StartTasksRequest + ) -> worker_pb2.Worker.StartTasksResponse: ... + def handle_stop_tasks(self, request: worker_pb2.Worker.StopTasksRequest) -> worker_pb2.Worker.StopTasksResponse: ... 
+ def handle_poll_tasks(self, request: worker_pb2.Worker.PollTasksRequest) -> worker_pb2.Worker.PollTasksResponse: ... def profile_task( self, task_id: str, duration_seconds: int, profile_type: job_pb2.ProfileType, attempt_id: int | None = None ) -> bytes: ... @@ -176,3 +182,25 @@ def exec_in_container( raise ConnectError(Code.INVALID_ARGUMENT, "command is required") timeout_seconds = request.timeout_seconds if request.timeout_seconds != 0 else 60 return self._provider.exec_in_container(request.task_id, list(request.command), timeout_seconds) + + def ping(self, request: worker_pb2.Worker.PingRequest, _ctx: RequestContext) -> worker_pb2.Worker.PingResponse: + with rpc_error_handler("ping"): + return self._provider.handle_ping(request) + + def start_tasks( + self, request: worker_pb2.Worker.StartTasksRequest, _ctx: RequestContext + ) -> worker_pb2.Worker.StartTasksResponse: + with rpc_error_handler("start_tasks"): + return self._provider.handle_start_tasks(request) + + def stop_tasks( + self, request: worker_pb2.Worker.StopTasksRequest, _ctx: RequestContext + ) -> worker_pb2.Worker.StopTasksResponse: + with rpc_error_handler("stop_tasks"): + return self._provider.handle_stop_tasks(request) + + def poll_tasks( + self, request: worker_pb2.Worker.PollTasksRequest, _ctx: RequestContext + ) -> worker_pb2.Worker.PollTasksResponse: + with rpc_error_handler("poll_tasks"): + return self._provider.handle_poll_tasks(request) diff --git a/lib/iris/src/iris/cluster/worker/task_attempt.py b/lib/iris/src/iris/cluster/worker/task_attempt.py index 669d41796e..131710b133 100644 --- a/lib/iris/src/iris/cluster/worker/task_attempt.py +++ b/lib/iris/src/iris/cluster/worker/task_attempt.py @@ -284,6 +284,7 @@ def __init__( self.thread: threading.Thread | None = None self.cleanup_done: bool = False self.should_stop: bool = False + self.on_state_change: Callable[[TaskState], None] | None = None @classmethod def adopt( @@ -479,6 +480,11 @@ def transition_to( self.error = error if exit_code 
is not None: self.exit_code = exit_code + if self.on_state_change is not None: + try: + self.on_state_change(state) + except Exception: + logger.debug("on_state_change callback failed", exc_info=True) def duration(self) -> Duration | None: """Calculate how long the attempt ran. diff --git a/lib/iris/src/iris/cluster/worker/worker.py b/lib/iris/src/iris/cluster/worker/worker.py index 689f13e325..c823177970 100644 --- a/lib/iris/src/iris/cluster/worker/worker.py +++ b/lib/iris/src/iris/cluster/worker/worker.py @@ -313,6 +313,7 @@ def adopt_running_containers(self) -> int: port_allocator=self._port_allocator, poll_interval_seconds=self._config.poll_interval.to_seconds(), ) + attempt.on_state_change = self._make_state_change_callback(attempt) key = (container.task_id, container.attempt_id) with self._lock: @@ -499,6 +500,51 @@ def _detach_log_handler(self) -> None: self._log_pusher.close() self._log_pusher = None + def _make_state_change_callback(self, attempt: TaskAttempt) -> Callable[[job_pb2.TaskState], None]: + """Build a closure that pushes a WorkerTaskStatus to the controller on transition. + + Runs synchronously on the TaskAttempt's own thread. RPC failures are + dropped — the controller's poll loop reconciles missed transitions. 
+ """ + + def _on_state_change(new_state: job_pb2.TaskState) -> None: + client = self._controller_client + if client is None or not self._worker_id: + return + reported_state = new_state + if reported_state == job_pb2.TASK_STATE_PENDING: + reported_state = job_pb2.TASK_STATE_BUILDING + entry = job_pb2.WorkerTaskStatus( + task_id=attempt.task_id.to_wire(), + attempt_id=attempt.attempt_id, + state=reported_state, + exit_code=attempt.exit_code or 0, + error=attempt.error or "", + container_id=attempt.platform_container_id or "", + ) + if attempt.finished_at is not None: + entry.finished_at.CopyFrom(timestamp_to_proto(attempt.finished_at)) + usage = job_pb2.ResourceUsage( + memory_mb=attempt.current_memory_mb, + memory_peak_mb=attempt.peak_memory_mb, + disk_mb=attempt.disk_mb, + cpu_millicores=attempt.current_cpu_millicores, + process_count=attempt.process_count, + ) + if usage.ByteSize() > 0: + entry.resource_usage.CopyFrom(usage) + try: + client.update_task_status( + controller_pb2.Controller.UpdateTaskStatusRequest( + worker_id=self._worker_id, + updates=[entry], + ) + ) + except Exception as e: + logger.warning("UpdateTaskStatus push failed for %s: %s", attempt.task_id, e) + + return _on_state_change + def _resolve_address(self) -> str: """Resolve the address to advertise to the controller.""" metadata = self._worker_metadata @@ -651,6 +697,7 @@ def submit_task(self, request: job_pb2.RunTaskRequest) -> str: log_pusher=self._log_pusher, poll_interval_seconds=self._config.poll_interval.to_seconds(), ) + attempt.on_state_change = self._make_state_change_callback(attempt) with self._lock: self._tasks[key] = attempt @@ -707,6 +754,91 @@ def list_current_tasks(self) -> list[TaskInfo]: by_task[task_id] = task return list(by_task.values()) + def _encode_task_status(self, task: TaskAttempt, task_id: str) -> job_pb2.WorkerTaskStatus: + """Build a WorkerTaskStatus proto from a worker-side TaskAttempt. 
+ + Maps PENDING → BUILDING because the controller treats PENDING as + "not yet picked up by a worker"; once a worker holds the task, the + earliest visible state to the controller is BUILDING. + """ + task_proto = task.to_proto() + reported_state = task.status + if reported_state == job_pb2.TASK_STATE_PENDING: + reported_state = job_pb2.TASK_STATE_BUILDING + entry = job_pb2.WorkerTaskStatus( + task_id=task_id, + attempt_id=task_proto.current_attempt_id, + state=reported_state, + exit_code=task_proto.exit_code, + error=task_proto.error or "", + container_id=task_proto.container_id or "", + ) + if task.status in self._TERMINAL_STATES: + entry.finished_at.CopyFrom(task_proto.finished_at) + if task_proto.resource_usage.ByteSize() > 0: + entry.resource_usage.CopyFrom(task_proto.resource_usage) + return entry + + @staticmethod + def _missing_task_status(task_id: str, expected_attempt_id: int) -> job_pb2.WorkerTaskStatus: + """Status for an expected task that the worker has no record of (lost state).""" + return job_pb2.WorkerTaskStatus( + task_id=task_id, + attempt_id=expected_attempt_id, + state=job_pb2.TASK_STATE_WORKER_FAILED, + exit_code=0, + error="Task not found on worker", + finished_at=timestamp_to_proto(Timestamp.now()), + ) + + def _reconcile_expected_tasks( + self, + expected_entries, + extra_expected_keys: set[tuple[str, int]] | None = None, + ) -> tuple[list[job_pb2.WorkerTaskStatus], list[tuple[str, int]]]: + """Build status entries for expected tasks; collect non-terminal local tasks + not in the expected set as targets to kill. + + Caller must hold ``self._lock``. + + ``extra_expected_keys`` keeps freshly-submitted tasks (e.g. ``tasks_to_run`` + on the legacy heartbeat) from being killed when they aren't yet in the + controller's expected set. 
+ """ + tasks: list[job_pb2.WorkerTaskStatus] = [] + expected_keys: set[tuple[str, int]] = set() + for expected_entry in expected_entries: + task_id = expected_entry.task_id + expected_attempt_id = expected_entry.attempt_id + key = (task_id, expected_attempt_id) + expected_keys.add(key) + task = self._tasks.get(key) + if task is None: + tasks.append(self._missing_task_status(task_id, expected_attempt_id)) + else: + tasks.append(self._encode_task_status(task, task_id)) + if extra_expected_keys: + expected_keys |= extra_expected_keys + tasks_to_kill: list[tuple[str, int]] = [] + for key, task in self._tasks.items(): + if key not in expected_keys and task.status not in self._TERMINAL_STATES: + tasks_to_kill.append(key) + return tasks, tasks_to_kill + + def _collect_resource_metrics(self) -> job_pb2.WorkerResourceSnapshot: + """Collect host metrics with running-task and process aggregates filled in.""" + snapshot = self._host_metrics.collect() + running_count = 0 + total_processes = 0 + with self._lock: + for task in self._tasks.values(): + if task.status == job_pb2.TASK_STATE_RUNNING: + running_count += 1 + total_processes += task.process_count + snapshot.running_task_count = running_count + snapshot.total_process_count = total_processes + return snapshot + def handle_heartbeat(self, request: job_pb2.HeartbeatRequest) -> job_pb2.HeartbeatResponse: """Handle controller-initiated heartbeat with reconciliation. @@ -758,60 +890,13 @@ def handle_heartbeat(self, request: job_pb2.HeartbeatRequest) -> job_pb2.Heartbe except Exception as e: logger.warning("Heartbeat: failed to kill task %s: %s", task_id, e) - tasks: list[job_pb2.WorkerTaskStatus] = [] - with slow_log(logger, "heartbeat reconciliation", threshold_ms=200): + # tasks_to_run was just submitted above; carry those keys so a + # newly-assigned task isn't killed if the controller hasn't yet + # listed it in expected_tasks. 
+ extra_keys = {(r.task_id, r.attempt_id) for r in request.tasks_to_run} with self._lock: - # Reconcile expected_tasks against actual state - for expected_entry in request.expected_tasks: - task_id = expected_entry.task_id - expected_attempt_id = expected_entry.attempt_id - key = (task_id, expected_attempt_id) - task = self._tasks.get(key) - - if task is None: - tasks.append( - job_pb2.WorkerTaskStatus( - task_id=task_id, - attempt_id=expected_attempt_id, - state=job_pb2.TASK_STATE_WORKER_FAILED, - exit_code=0, - error="Task not found on worker", - finished_at=timestamp_to_proto(Timestamp.now()), - ) - ) - else: - task_proto = task.to_proto() - reported_state = task.status - if reported_state == job_pb2.TASK_STATE_PENDING: - reported_state = job_pb2.TASK_STATE_BUILDING - - entry = job_pb2.WorkerTaskStatus( - task_id=task_id, - attempt_id=task_proto.current_attempt_id, - state=reported_state, - exit_code=task_proto.exit_code, - error=task_proto.error or "", - container_id=task_proto.container_id or "", - ) - if task.status in self._TERMINAL_STATES: - entry.finished_at.CopyFrom(task_proto.finished_at) - if task_proto.resource_usage.ByteSize() > 0: - entry.resource_usage.CopyFrom(task_proto.resource_usage) - tasks.append(entry) - - # Kill tasks not in expected_tasks - the controller has decided these - # tasks should no longer run (e.g., job was killed, task was reassigned). - # Include tasks_to_run in the expected set: these were just submitted - # in this heartbeat and may not yet appear in expected_tasks if the - # controller excludes unconfirmed tasks. 
- expected_keys = {(entry.task_id, entry.attempt_id) for entry in request.expected_tasks} - for run_req in request.tasks_to_run: - expected_keys.add((run_req.task_id, run_req.attempt_id)) - tasks_to_kill: list[tuple[str, int]] = [] - for key, task in self._tasks.items(): - if key not in expected_keys and task.status not in self._TERMINAL_STATES: - tasks_to_kill.append(key) + tasks, tasks_to_kill = self._reconcile_expected_tasks(request.expected_tasks, extra_keys) # Kill removed tasks asynchronously outside lock to avoid deadlock for task_id, attempt_id in tasks_to_kill: @@ -820,16 +905,7 @@ def handle_heartbeat(self, request: job_pb2.HeartbeatRequest) -> job_pb2.Heartbe # Collect host metrics and aggregate task stats with slow_log(logger, "heartbeat host_metrics", threshold_ms=100): - resource_snapshot = self._host_metrics.collect() - running_count = 0 - total_processes = 0 - with self._lock: - for task in self._tasks.values(): - if task.status == job_pb2.TASK_STATE_RUNNING: - running_count += 1 - total_processes += task.process_count - resource_snapshot.running_task_count = running_count - resource_snapshot.total_process_count = total_processes + resource_snapshot = self._collect_resource_metrics() # Run health checks to detect local faults (disk full, write failure) with slow_log(logger, "heartbeat health_check", threshold_ms=100): @@ -844,6 +920,53 @@ def handle_heartbeat(self, request: job_pb2.HeartbeatRequest) -> job_pb2.Heartbe health_error=health.error, ) + def handle_ping(self, request: worker_pb2.Worker.PingRequest) -> worker_pb2.Worker.PingResponse: + """Liveness check. 
Resets heartbeat deadline, returns resource snapshot and health.""" + self._heartbeat_deadline = Deadline.from_seconds(self._config.heartbeat_timeout.to_seconds()) + resource_snapshot = self._collect_resource_metrics() + health = check_worker_health(disk_path=str(self._cache_dir)) + if not health.healthy: + logger.warning("Worker health check failed: %s", health.error) + return worker_pb2.Worker.PingResponse( + resource_snapshot=resource_snapshot, + healthy=health.healthy, + health_error=health.error, + ) + + def handle_start_tasks(self, request: worker_pb2.Worker.StartTasksRequest) -> worker_pb2.Worker.StartTasksResponse: + """Start task attempts on this worker. Returns per-task ack.""" + acks = [] + for run_req in request.tasks: + try: + self.submit_task(run_req) + logger.info("StartTasks: submitted task %s", run_req.task_id) + acks.append(worker_pb2.Worker.TaskAck(task_id=run_req.task_id, accepted=True)) + except Exception as e: + logger.warning("StartTasks: failed to submit task %s: %s", run_req.task_id, e) + acks.append(worker_pb2.Worker.TaskAck(task_id=run_req.task_id, accepted=False, error=str(e))) + return worker_pb2.Worker.StartTasksResponse(acks=acks) + + def handle_stop_tasks(self, request: worker_pb2.Worker.StopTasksRequest) -> worker_pb2.Worker.StopTasksResponse: + """Stop given tasks on this worker.""" + for task_id in request.task_ids: + try: + current = self._get_current_attempt(task_id) + if current: + self._kill_task_attempt(task_id, current.attempt_id, async_kill=True) + logger.info("StopTasks: initiated async kill for task %s", task_id) + except Exception as e: + logger.warning("StopTasks: failed to kill task %s: %s", task_id, e) + return worker_pb2.Worker.StopTasksResponse() + + def handle_poll_tasks(self, request: worker_pb2.Worker.PollTasksRequest) -> worker_pb2.Worker.PollTasksResponse: + """Report status of expected tasks and kill unexpected tasks.""" + with self._lock: + tasks, tasks_to_kill = 
self._reconcile_expected_tasks(request.expected_tasks) + for task_id, attempt_id in tasks_to_kill: + logger.warning("PollTasks: killing task %s attempt %d (unexpected)", task_id, attempt_id) + self._kill_task_attempt(task_id, attempt_id, async_kill=True) + return worker_pb2.Worker.PollTasksResponse(tasks=tasks) + def _kill_task_attempt( self, task_id: str, diff --git a/lib/iris/src/iris/rpc/config.proto b/lib/iris/src/iris/rpc/config.proto index 3c7a1bdcd1..684840d01d 100644 --- a/lib/iris/src/iris/rpc/config.proto +++ b/lib/iris/src/iris/rpc/config.proto @@ -363,6 +363,7 @@ message ControllerVmConfig { string image = 10; // Controller docker image (shared by all controller types) iris.time.Duration worker_timeout = 12; // Default: 60s int32 heartbeat_failure_threshold = 13; // Consecutive heartbeat failures before marking worker dead (default: 10) + bool use_split_heartbeat = 14; // Default: true. Set false to use the legacy monolithic heartbeat path. oneof controller { GcpControllerConfig gcp = 1; ManualControllerConfig manual = 2; diff --git a/lib/iris/src/iris/rpc/config_pb2.py b/lib/iris/src/iris/rpc/config_pb2.py index f2336677fe..f4ab9ec995 100644 --- a/lib/iris/src/iris/rpc/config_pb2.py +++ b/lib/iris/src/iris/rpc/config_pb2.py @@ -25,7 +25,7 @@ from . 
import time_pb2 as time__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x63onfig.proto\x12\x0biris.config\x1a\ntime.proto\"H\n\x11GcpPlatformConfig\x12\x1d\n\nproject_id\x18\x01 \x01(\tR\tprojectId\x12\x14\n\x05zones\x18\x05 \x03(\tR\x05zones\"\x16\n\x14ManualPlatformConfig\"\x15\n\x13LocalPlatformConfig\"\xb0\x01\n\x17\x43oreweavePlatformConfig\x12\x16\n\x06region\x18\x01 \x01(\tR\x06region\x12\x1c\n\tnamespace\x18\x02 \x01(\tR\tnamespace\x12\'\n\x0fkubeconfig_path\x18\x03 \x01(\tR\x0ekubeconfigPath\x12\x36\n\x17object_storage_endpoint\x18\x04 \x01(\tR\x15objectStorageEndpoint\"\xb0\x02\n\x0ePlatformConfig\x12!\n\x0clabel_prefix\x18\n \x01(\tR\x0blabelPrefix\x12\x32\n\x03gcp\x18\x01 \x01(\x0b\x32\x1e.iris.config.GcpPlatformConfigH\x00R\x03gcp\x12;\n\x06manual\x18\x02 \x01(\x0b\x32!.iris.config.ManualPlatformConfigH\x00R\x06manual\x12\x38\n\x05local\x18\x03 \x01(\x0b\x32 .iris.config.LocalPlatformConfigH\x00R\x05local\x12\x44\n\tcoreweave\x18\x04 \x01(\x0b\x32$.iris.config.CoreweavePlatformConfigH\x00R\tcoreweaveB\n\n\x08platform\"c\n\x0eManualProvider\x12\x14\n\x05hosts\x18\x01 \x03(\tR\x05hosts\x12\x19\n\x08ssh_user\x18\x02 \x01(\tR\x07sshUser\x12 \n\x0cssh_key_file\x18\x03 \x01(\tR\nsshKeyFile\"\x98\x01\n\x0bGcpVmConfig\x12\x12\n\x04zone\x18\x01 \x01(\tR\x04zone\x12!\n\x0cmachine_type\x18\x02 \x01(\tR\x0bmachineType\x12)\n\x11\x62oot_disk_size_gb\x18\x03 \x01(\x05R\x0e\x62ootDiskSizeGb\x12\'\n\x0fservice_account\x18\x04 \x01(\tR\x0eserviceAccount\"a\n\x0eManualVmConfig\x12\x12\n\x04host\x18\x01 \x01(\tR\x04host\x12\x19\n\x08ssh_user\x18\x02 \x01(\tR\x07sshUser\x12 \n\x0cssh_key_file\x18\x03 \x01(\tR\nsshKeyFile\"\x83\x03\n\x08VmConfig\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x39\n\x06labels\x18\x02 \x03(\x0b\x32!.iris.config.VmConfig.LabelsEntryR\x06labels\x12?\n\x08metadata\x18\x03 \x03(\x0b\x32#.iris.config.VmConfig.MetadataEntryR\x08metadata\x12,\n\x03gcp\x18\n 
\x01(\x0b\x32\x18.iris.config.GcpVmConfigH\x00R\x03gcp\x12\x35\n\x06manual\x18\x0b \x01(\x0b\x32\x1b.iris.config.ManualVmConfigH\x00R\x06manual\x1a\x39\n\x0bLabelsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\n\n\x08platform\"\xc4\x02\n\x0eGcpSliceConfig\x12<\n\x04mode\x18\x06 \x01(\x0e\x32(.iris.config.GcpSliceConfig.GcpSliceModeR\x04mode\x12\x12\n\x04zone\x18\x01 \x01(\tR\x04zone\x12\'\n\x0fruntime_version\x18\x02 \x01(\tR\x0eruntimeVersion\x12\x1a\n\x08topology\x18\x04 \x01(\tR\x08topology\x12!\n\x0cmachine_type\x18\x07 \x01(\tR\x0bmachineType\x12\'\n\x0fservice_account\x18\t \x01(\tR\x0eserviceAccount\"=\n\x0cGcpSliceMode\x12\x16\n\x12GCP_SLICE_MODE_TPU\x10\x00\x12\x15\n\x11GCP_SLICE_MODE_VM\x10\x01J\x04\x08\x03\x10\x04J\x04\x08\x05\x10\x06J\x04\x08\x08\x10\t\"\x90\x01\n\x14\x43oreweaveSliceConfig\x12\x16\n\x06region\x18\x01 \x01(\tR\x06region\x12#\n\rinstance_type\x18\x02 \x01(\tR\x0cinstanceType\x12\x1b\n\tgpu_class\x18\x04 \x01(\tR\x08gpuClass\x12\x1e\n\ninfiniband\x18\x05 \x01(\x08R\ninfiniband\"f\n\x11ManualSliceConfig\x12\x14\n\x05hosts\x18\x01 \x03(\tR\x05hosts\x12\x19\n\x08ssh_user\x18\x02 \x01(\tR\x07sshUser\x12 \n\x0cssh_key_file\x18\x03 \x01(\tR\nsshKeyFile\"\x12\n\x10LocalSliceConfig\"\xb0\x05\n\x0bSliceConfig\x12\x1f\n\x0bname_prefix\x18\x01 \x01(\tR\nnamePrefix\x12\x17\n\x07num_vms\x18\x02 \x01(\x05R\x06numVms\x12G\n\x10\x61\x63\x63\x65lerator_type\x18\x03 \x01(\x0e\x32\x1c.iris.config.AcceleratorTypeR\x0f\x61\x63\x63\x65leratorType\x12/\n\x13\x61\x63\x63\x65lerator_variant\x18\x04 \x01(\tR\x12\x61\x63\x63\x65leratorVariant\x12<\n\x06labels\x18\x05 \x03(\x0b\x32$.iris.config.SliceConfig.LabelsEntryR\x06labels\x12\x1b\n\tgpu_count\x18\x07 \x01(\x05R\x08gpuCount\x12 \n\x0c\x64isk_size_gb\x18\x08 \x01(\x05R\ndiskSizeGb\x12>\n\rcapacity_type\x18\x0e 
\x01(\x0e\x32\x19.iris.config.CapacityTypeR\x0c\x63\x61pacityType\x12/\n\x03gcp\x18\n \x01(\x0b\x32\x1b.iris.config.GcpSliceConfigH\x00R\x03gcp\x12\x41\n\tcoreweave\x18\x0b \x01(\x0b\x32!.iris.config.CoreweaveSliceConfigH\x00R\tcoreweave\x12\x38\n\x06manual\x18\x0c \x01(\x0b\x32\x1e.iris.config.ManualSliceConfigH\x00R\x06manual\x12\x35\n\x05local\x18\r \x01(\x0b\x32\x1d.iris.config.LocalSliceConfigH\x00R\x05local\x1a\x39\n\x0bLabelsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\n\n\x08platformJ\x04\x08\x06\x10\x07\"\xd9\x02\n\x13ScaleGroupResources\x12%\n\x0e\x63pu_millicores\x18\x01 \x01(\x05R\rcpuMillicores\x12!\n\x0cmemory_bytes\x18\x02 \x01(\x03R\x0bmemoryBytes\x12\x1d\n\ndisk_bytes\x18\x03 \x01(\x03R\tdiskBytes\x12=\n\x0b\x64\x65vice_type\x18\n \x01(\x0e\x32\x1c.iris.config.AcceleratorTypeR\ndeviceType\x12%\n\x0e\x64\x65vice_variant\x18\x0b \x01(\tR\rdeviceVariant\x12!\n\x0c\x64\x65vice_count\x18\x0c \x01(\x05R\x0b\x64\x65viceCount\x12>\n\rcapacity_type\x18\x15 \x01(\x0e\x32\x19.iris.config.CapacityTypeR\x0c\x63\x61pacityTypeJ\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x14\x10\x15\"\x9c\x01\n\x0eWorkerSettings\x12K\n\nattributes\x18\x01 \x03(\x0b\x32+.iris.config.WorkerSettings.AttributesEntryR\nattributes\x1a=\n\x0f\x41ttributesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\"\xe0\x04\n\x10ScaleGroupConfig\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12*\n\rbuffer_slices\x18\x03 \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x0c\x62ufferSlices\x12$\n\nmax_slices\x18\x04 \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\tmaxSlices\x12>\n\tresources\x18\x0e \x01(\x0b\x32 .iris.config.ScaleGroupResourcesR\tresources\x12\x1e\n\x07num_vms\x18\x0f \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x06numVms\x12!\n\x08priority\x18\x1e \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x08priority\x12\x34\n\x13scale_up_rate_limit\x18\x46 
\x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x10scaleUpRateLimit\x12\x38\n\x15scale_down_rate_limit\x18G \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x12scaleDownRateLimit\x12?\n\x0eslice_template\x18\x32 \x01(\x0b\x32\x18.iris.config.SliceConfigR\rsliceTemplate\x12\x33\n\x06worker\x18< \x01(\x0b\x32\x1b.iris.config.WorkerSettingsR\x06worker\x12\x1d\n\nquota_pool\x18P \x01(\tR\tquotaPool\x12.\n\x0f\x61llocation_tier\x18Q \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x0e\x61llocationTierJ\x04\x08\x02\x10\x03J\x04\x08\t\x10\nJ\x04\x08\n\x10\x0bJ\x04\x08\x0b\x10\x0cJ\x04\x08\x0c\x10\rJ\x04\x08\r\x10\x0eJ\x04\x08\x14\x10\x15J\x04\x08(\x10)\"\xbe\x08\n\x0cWorkerConfig\x12!\n\x0c\x64ocker_image\x18\x01 \x01(\tR\x0b\x64ockerImage\x12\x12\n\x04host\x18\x02 \x01(\tR\x04host\x12\x12\n\x04port\x18\x03 \x01(\x05R\x04port\x12\x1d\n\nport_range\x18\x04 \x01(\tR\tportRange\x12\x1b\n\tworker_id\x18\x05 \x01(\tR\x08workerId\x12-\n\x12\x63ontroller_address\x18\x06 \x01(\tR\x11\x63ontrollerAddress\x12\x1b\n\tcache_dir\x18\x07 \x01(\tR\x08\x63\x61\x63heDir\x12,\n\x12\x64\x65\x66\x61ult_task_image\x18\t \x01(\tR\x10\x64\x65\x66\x61ultTaskImage\x12\x41\n\x08task_env\x18\n \x03(\x0b\x32&.iris.config.WorkerConfig.TaskEnvEntryR\x07taskEnv\x12\x18\n\x07runtime\x18\x0b \x01(\tR\x07runtime\x12G\n\x10\x61\x63\x63\x65lerator_type\x18\x14 \x01(\x0e\x32\x1c.iris.config.AcceleratorTypeR\x0f\x61\x63\x63\x65leratorType\x12/\n\x13\x61\x63\x63\x65lerator_variant\x18\x15 \x01(\tR\x12\x61\x63\x63\x65leratorVariant\x12\x1b\n\tgpu_count\x18\x16 \x01(\x05R\x08gpuCount\x12>\n\rcapacity_type\x18\x18 \x01(\x0e\x32\x19.iris.config.CapacityTypeR\x0c\x63\x61pacityType\x12\\\n\x11worker_attributes\x18\x1e \x03(\x0b\x32/.iris.config.WorkerConfig.WorkerAttributesEntryR\x10workerAttributes\x12\x38\n\rpoll_interval\x18( \x01(\x0b\x32\x13.iris.time.DurationR\x0cpollInterval\x12@\n\x11heartbeat_timeout\x18) \x01(\x0b\x32\x13.iris.time.DurationR\x10heartbeatTimeout\x12\x19\n\x08slice_id\x18\x08 
\x01(\tR\x07sliceId\x12\x37\n\x08platform\x18\x32 \x01(\x0b\x32\x1b.iris.config.PlatformConfigR\x08platform\x12%\n\x0estorage_prefix\x18< \x01(\tR\rstoragePrefix\x12\x1d\n\nauth_token\x18\x46 \x01(\tR\tauthToken\x1a:\n\x0cTaskEnvEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x43\n\x15WorkerAttributesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01J\x04\x08\x17\x10\x18\"\xf2\x02\n\tSshConfig\x12\x12\n\x04user\x18\x01 \x01(\tR\x04user\x12\x19\n\x08key_file\x18\x02 \x01(\tR\x07keyFile\x12\x12\n\x04port\x18\x03 \x01(\x05R\x04port\x12<\n\x0f\x63onnect_timeout\x18\x04 \x01(\x0b\x32\x13.iris.time.DurationR\x0e\x63onnectTimeout\x12<\n\tauth_mode\x18\x05 \x01(\x0e\x32\x1f.iris.config.SshConfig.AuthModeR\x08\x61uthMode\x12\"\n\ros_login_user\x18\x06 \x01(\tR\x0bosLoginUser\x12>\n\x1bimpersonate_service_account\x18\x07 \x01(\tR\x19impersonateServiceAccount\"B\n\x08\x41uthMode\x12\x1a\n\x16SSH_AUTH_MODE_METADATA\x10\x00\x12\x1a\n\x16SSH_AUTH_MODE_OS_LOGIN\x10\x01\"a\n\rStorageConfig\x12&\n\x0flocal_state_dir\x18\x01 \x01(\tR\rlocalStateDir\x12(\n\x10remote_state_dir\x18\x02 \x01(\tR\x0eremoteStateDir\"\xb4\x01\n\x13GcpControllerConfig\x12\x12\n\x04zone\x18\x01 \x01(\tR\x04zone\x12!\n\x0cmachine_type\x18\x02 \x01(\tR\x0bmachineType\x12)\n\x11\x62oot_disk_size_gb\x18\x03 \x01(\x05R\x0e\x62ootDiskSizeGb\x12\x12\n\x04port\x18\x04 \x01(\x05R\x04port\x12\'\n\x0fservice_account\x18\x05 \x01(\tR\x0eserviceAccount\"@\n\x16ManualControllerConfig\x12\x12\n\x04host\x18\x01 \x01(\tR\x04host\x12\x12\n\x04port\x18\x03 \x01(\x05R\x04port\"+\n\x15LocalControllerConfig\x12\x12\n\x04port\x18\x01 \x01(\x05R\x04port\"s\n\x19\x43oreweaveControllerConfig\x12\x12\n\x04port\x18\x01 \x01(\x05R\x04port\x12!\n\x0cservice_name\x18\x02 \x01(\tR\x0bserviceName\x12\x1f\n\x0bscale_group\x18\x03 \x01(\tR\nscaleGroup\"\xad\x03\n\x12\x43ontrollerVmConfig\x12\x14\n\x05image\x18\n 
\x01(\tR\x05image\x12:\n\x0eworker_timeout\x18\x0c \x01(\x0b\x32\x13.iris.time.DurationR\rworkerTimeout\x12>\n\x1bheartbeat_failure_threshold\x18\r \x01(\x05R\x19heartbeatFailureThreshold\x12\x34\n\x03gcp\x18\x01 \x01(\x0b\x32 .iris.config.GcpControllerConfigH\x00R\x03gcp\x12=\n\x06manual\x18\x02 \x01(\x0b\x32#.iris.config.ManualControllerConfigH\x00R\x06manual\x12:\n\x05local\x18\x03 \x01(\x0b\x32\".iris.config.LocalControllerConfigH\x00R\x05local\x12\x46\n\tcoreweave\x18\x04 \x01(\x0b\x32&.iris.config.CoreweaveControllerConfigH\x00R\tcoreweaveB\x0c\n\ncontroller\"\xe4\x02\n\x10\x41utoscalerConfig\x12\x44\n\x13\x65valuation_interval\x18\x01 \x01(\x0b\x32\x13.iris.time.DurationR\x12\x65valuationInterval\x12\x39\n\x0escale_up_delay\x18\x03 \x01(\x0b\x32\x13.iris.time.DurationR\x0cscaleUpDelay\x12=\n\x10scale_down_delay\x18\x04 \x01(\x0b\x32\x13.iris.time.DurationR\x0escaleDownDelay\x12\x45\n\x14startup_grace_period\x18\x05 \x01(\x0b\x32\x13.iris.time.DurationR\x12startupGracePeriod\x12I\n\x16heartbeat_grace_period\x18\x06 \x01(\x0b\x32\x13.iris.time.DurationR\x14heartbeatGracePeriod\"\xb9\x02\n\x0e\x44\x65\x66\x61ultsConfig\x12(\n\x03ssh\x18\x02 \x01(\x0b\x32\x16.iris.config.SshConfigR\x03ssh\x12=\n\nautoscaler\x18\x03 \x01(\x0b\x32\x1d.iris.config.AutoscalerConfigR\nautoscaler\x12\x31\n\x06worker\x18\x06 \x01(\x0b\x32\x19.iris.config.WorkerConfigR\x06worker\x12\x43\n\x08task_env\x18\x07 \x03(\x0b\x32(.iris.config.DefaultsConfig.TaskEnvEntryR\x07taskEnv\x1a:\n\x0cTaskEnvEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06\".\n\rGcpAuthConfig\x12\x1d\n\nproject_id\x18\x01 \x01(\tR\tprojectId\"\x90\x01\n\x10StaticAuthConfig\x12\x41\n\x06tokens\x18\x01 \x03(\x0b\x32).iris.config.StaticAuthConfig.TokensEntryR\x06tokens\x1a\x39\n\x0bTokensEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 
\x01(\tR\x05value:\x02\x38\x01\"\xbe\x01\n\nAuthConfig\x12.\n\x03gcp\x18\x01 \x01(\x0b\x32\x1a.iris.config.GcpAuthConfigH\x00R\x03gcp\x12\x37\n\x06static\x18\x02 \x01(\x0b\x32\x1d.iris.config.StaticAuthConfigH\x00R\x06static\x12\x1f\n\x0b\x61\x64min_users\x18\x03 \x03(\tR\nadminUsers\x12\x1a\n\x08optional\x18\x04 \x01(\x08R\x08optionalB\n\n\x08provider\"\x16\n\x14WorkerProviderConfig\"\xcd\x02\n\x18KubernetesProviderConfig\x12\x1c\n\tnamespace\x18\x01 \x01(\tR\tnamespace\x12\x1e\n\nkubeconfig\x18\x02 \x01(\tR\nkubeconfig\x12#\n\rdefault_image\x18\x03 \x01(\tR\x0c\x64\x65\x66\x61ultImage\x12\x36\n\x17\x63olocation_topology_key\x18\x04 \x01(\tR\x15\x63olocationTopologyKey\x12\'\n\x0fservice_account\x18\x05 \x01(\tR\x0eserviceAccount\x12!\n\x0chost_network\x18\x06 \x01(\x08R\x0bhostNetwork\x12\x1b\n\tcache_dir\x18\x07 \x01(\tR\x08\x63\x61\x63heDir\x12-\n\x12\x63ontroller_address\x18\x08 \x01(\tR\x11\x63ontrollerAddress\"\xa4\x05\n\x11IrisClusterConfig\x12\x12\n\x04name\x18\x05 \x01(\tR\x04name\x12\x37\n\x08platform\x18\n \x01(\x0b\x32\x1b.iris.config.PlatformConfigR\x08platform\x12\x37\n\x08\x64\x65\x66\x61ults\x18\x0b \x01(\x0b\x32\x1b.iris.config.DefaultsConfigR\x08\x64\x65\x66\x61ults\x12\x34\n\x07storage\x18\x0c \x01(\x0b\x32\x1a.iris.config.StorageConfigR\x07storage\x12?\n\ncontroller\x18\x1f \x01(\x0b\x32\x1f.iris.config.ControllerVmConfigR\ncontroller\x12R\n\x0cscale_groups\x18\x32 \x03(\x0b\x32/.iris.config.IrisClusterConfig.ScaleGroupsEntryR\x0bscaleGroups\x12+\n\x04\x61uth\x18< \x01(\x0b\x32\x17.iris.config.AuthConfigR\x04\x61uth\x12X\n\x13kubernetes_provider\x18\x46 \x01(\x0b\x32%.iris.config.KubernetesProviderConfigH\x00R\x12kubernetesProvider\x12L\n\x0fworker_provider\x18G \x01(\x0b\x32!.iris.config.WorkerProviderConfigH\x00R\x0eworkerProvider\x1a]\n\x10ScaleGroupsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x33\n\x05value\x18\x02 
\x01(\x0b\x32\x1d.iris.config.ScaleGroupConfigR\x05value:\x02\x38\x01\x42\n\n\x08provider*\x81\x01\n\x0f\x41\x63\x63\x65leratorType\x12 \n\x1c\x41\x43\x43\x45LERATOR_TYPE_UNSPECIFIED\x10\x00\x12\x18\n\x14\x41\x43\x43\x45LERATOR_TYPE_CPU\x10\x01\x12\x18\n\x14\x41\x43\x43\x45LERATOR_TYPE_GPU\x10\x02\x12\x18\n\x14\x41\x43\x43\x45LERATOR_TYPE_TPU\x10\x03*\x85\x01\n\x0c\x43\x61pacityType\x12\x1d\n\x19\x43\x41PACITY_TYPE_UNSPECIFIED\x10\x00\x12\x1d\n\x19\x43\x41PACITY_TYPE_PREEMPTIBLE\x10\x01\x12\x1b\n\x17\x43\x41PACITY_TYPE_ON_DEMAND\x10\x02\x12\x1a\n\x16\x43\x41PACITY_TYPE_RESERVED\x10\x03\x42p\n\x0f\x63om.iris.configB\x0b\x43onfigProtoP\x01\xa2\x02\x03ICX\xaa\x02\x0bIris.Config\xca\x02\x0bIris\\Config\xe2\x02\x17Iris\\Config\\GPBMetadata\xea\x02\x0cIris::Config\x92\x03\x02\x08\x01\x62\x08\x65\x64itionsp\xe8\x07') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x63onfig.proto\x12\x0biris.config\x1a\ntime.proto\"H\n\x11GcpPlatformConfig\x12\x1d\n\nproject_id\x18\x01 \x01(\tR\tprojectId\x12\x14\n\x05zones\x18\x05 \x03(\tR\x05zones\"\x16\n\x14ManualPlatformConfig\"\x15\n\x13LocalPlatformConfig\"\xb0\x01\n\x17\x43oreweavePlatformConfig\x12\x16\n\x06region\x18\x01 \x01(\tR\x06region\x12\x1c\n\tnamespace\x18\x02 \x01(\tR\tnamespace\x12\'\n\x0fkubeconfig_path\x18\x03 \x01(\tR\x0ekubeconfigPath\x12\x36\n\x17object_storage_endpoint\x18\x04 \x01(\tR\x15objectStorageEndpoint\"\xb0\x02\n\x0ePlatformConfig\x12!\n\x0clabel_prefix\x18\n \x01(\tR\x0blabelPrefix\x12\x32\n\x03gcp\x18\x01 \x01(\x0b\x32\x1e.iris.config.GcpPlatformConfigH\x00R\x03gcp\x12;\n\x06manual\x18\x02 \x01(\x0b\x32!.iris.config.ManualPlatformConfigH\x00R\x06manual\x12\x38\n\x05local\x18\x03 \x01(\x0b\x32 .iris.config.LocalPlatformConfigH\x00R\x05local\x12\x44\n\tcoreweave\x18\x04 \x01(\x0b\x32$.iris.config.CoreweavePlatformConfigH\x00R\tcoreweaveB\n\n\x08platform\"c\n\x0eManualProvider\x12\x14\n\x05hosts\x18\x01 \x03(\tR\x05hosts\x12\x19\n\x08ssh_user\x18\x02 \x01(\tR\x07sshUser\x12 
\n\x0cssh_key_file\x18\x03 \x01(\tR\nsshKeyFile\"\x98\x01\n\x0bGcpVmConfig\x12\x12\n\x04zone\x18\x01 \x01(\tR\x04zone\x12!\n\x0cmachine_type\x18\x02 \x01(\tR\x0bmachineType\x12)\n\x11\x62oot_disk_size_gb\x18\x03 \x01(\x05R\x0e\x62ootDiskSizeGb\x12\'\n\x0fservice_account\x18\x04 \x01(\tR\x0eserviceAccount\"a\n\x0eManualVmConfig\x12\x12\n\x04host\x18\x01 \x01(\tR\x04host\x12\x19\n\x08ssh_user\x18\x02 \x01(\tR\x07sshUser\x12 \n\x0cssh_key_file\x18\x03 \x01(\tR\nsshKeyFile\"\x83\x03\n\x08VmConfig\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x39\n\x06labels\x18\x02 \x03(\x0b\x32!.iris.config.VmConfig.LabelsEntryR\x06labels\x12?\n\x08metadata\x18\x03 \x03(\x0b\x32#.iris.config.VmConfig.MetadataEntryR\x08metadata\x12,\n\x03gcp\x18\n \x01(\x0b\x32\x18.iris.config.GcpVmConfigH\x00R\x03gcp\x12\x35\n\x06manual\x18\x0b \x01(\x0b\x32\x1b.iris.config.ManualVmConfigH\x00R\x06manual\x1a\x39\n\x0bLabelsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\n\n\x08platform\"\xc4\x02\n\x0eGcpSliceConfig\x12<\n\x04mode\x18\x06 \x01(\x0e\x32(.iris.config.GcpSliceConfig.GcpSliceModeR\x04mode\x12\x12\n\x04zone\x18\x01 \x01(\tR\x04zone\x12\'\n\x0fruntime_version\x18\x02 \x01(\tR\x0eruntimeVersion\x12\x1a\n\x08topology\x18\x04 \x01(\tR\x08topology\x12!\n\x0cmachine_type\x18\x07 \x01(\tR\x0bmachineType\x12\'\n\x0fservice_account\x18\t \x01(\tR\x0eserviceAccount\"=\n\x0cGcpSliceMode\x12\x16\n\x12GCP_SLICE_MODE_TPU\x10\x00\x12\x15\n\x11GCP_SLICE_MODE_VM\x10\x01J\x04\x08\x03\x10\x04J\x04\x08\x05\x10\x06J\x04\x08\x08\x10\t\"\x90\x01\n\x14\x43oreweaveSliceConfig\x12\x16\n\x06region\x18\x01 \x01(\tR\x06region\x12#\n\rinstance_type\x18\x02 \x01(\tR\x0cinstanceType\x12\x1b\n\tgpu_class\x18\x04 \x01(\tR\x08gpuClass\x12\x1e\n\ninfiniband\x18\x05 \x01(\x08R\ninfiniband\"f\n\x11ManualSliceConfig\x12\x14\n\x05hosts\x18\x01 
\x03(\tR\x05hosts\x12\x19\n\x08ssh_user\x18\x02 \x01(\tR\x07sshUser\x12 \n\x0cssh_key_file\x18\x03 \x01(\tR\nsshKeyFile\"\x12\n\x10LocalSliceConfig\"\xb0\x05\n\x0bSliceConfig\x12\x1f\n\x0bname_prefix\x18\x01 \x01(\tR\nnamePrefix\x12\x17\n\x07num_vms\x18\x02 \x01(\x05R\x06numVms\x12G\n\x10\x61\x63\x63\x65lerator_type\x18\x03 \x01(\x0e\x32\x1c.iris.config.AcceleratorTypeR\x0f\x61\x63\x63\x65leratorType\x12/\n\x13\x61\x63\x63\x65lerator_variant\x18\x04 \x01(\tR\x12\x61\x63\x63\x65leratorVariant\x12<\n\x06labels\x18\x05 \x03(\x0b\x32$.iris.config.SliceConfig.LabelsEntryR\x06labels\x12\x1b\n\tgpu_count\x18\x07 \x01(\x05R\x08gpuCount\x12 \n\x0c\x64isk_size_gb\x18\x08 \x01(\x05R\ndiskSizeGb\x12>\n\rcapacity_type\x18\x0e \x01(\x0e\x32\x19.iris.config.CapacityTypeR\x0c\x63\x61pacityType\x12/\n\x03gcp\x18\n \x01(\x0b\x32\x1b.iris.config.GcpSliceConfigH\x00R\x03gcp\x12\x41\n\tcoreweave\x18\x0b \x01(\x0b\x32!.iris.config.CoreweaveSliceConfigH\x00R\tcoreweave\x12\x38\n\x06manual\x18\x0c \x01(\x0b\x32\x1e.iris.config.ManualSliceConfigH\x00R\x06manual\x12\x35\n\x05local\x18\r \x01(\x0b\x32\x1d.iris.config.LocalSliceConfigH\x00R\x05local\x1a\x39\n\x0bLabelsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\n\n\x08platformJ\x04\x08\x06\x10\x07\"\xd9\x02\n\x13ScaleGroupResources\x12%\n\x0e\x63pu_millicores\x18\x01 \x01(\x05R\rcpuMillicores\x12!\n\x0cmemory_bytes\x18\x02 \x01(\x03R\x0bmemoryBytes\x12\x1d\n\ndisk_bytes\x18\x03 \x01(\x03R\tdiskBytes\x12=\n\x0b\x64\x65vice_type\x18\n \x01(\x0e\x32\x1c.iris.config.AcceleratorTypeR\ndeviceType\x12%\n\x0e\x64\x65vice_variant\x18\x0b \x01(\tR\rdeviceVariant\x12!\n\x0c\x64\x65vice_count\x18\x0c \x01(\x05R\x0b\x64\x65viceCount\x12>\n\rcapacity_type\x18\x15 \x01(\x0e\x32\x19.iris.config.CapacityTypeR\x0c\x63\x61pacityTypeJ\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x14\x10\x15\"\x9c\x01\n\x0eWorkerSettings\x12K\n\nattributes\x18\x01 
\x03(\x0b\x32+.iris.config.WorkerSettings.AttributesEntryR\nattributes\x1a=\n\x0f\x41ttributesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\"\xe0\x04\n\x10ScaleGroupConfig\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12*\n\rbuffer_slices\x18\x03 \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x0c\x62ufferSlices\x12$\n\nmax_slices\x18\x04 \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\tmaxSlices\x12>\n\tresources\x18\x0e \x01(\x0b\x32 .iris.config.ScaleGroupResourcesR\tresources\x12\x1e\n\x07num_vms\x18\x0f \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x06numVms\x12!\n\x08priority\x18\x1e \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x08priority\x12\x34\n\x13scale_up_rate_limit\x18\x46 \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x10scaleUpRateLimit\x12\x38\n\x15scale_down_rate_limit\x18G \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x12scaleDownRateLimit\x12?\n\x0eslice_template\x18\x32 \x01(\x0b\x32\x18.iris.config.SliceConfigR\rsliceTemplate\x12\x33\n\x06worker\x18< \x01(\x0b\x32\x1b.iris.config.WorkerSettingsR\x06worker\x12\x1d\n\nquota_pool\x18P \x01(\tR\tquotaPool\x12.\n\x0f\x61llocation_tier\x18Q \x01(\x05\x42\x05\xaa\x01\x02\x08\x01R\x0e\x61llocationTierJ\x04\x08\x02\x10\x03J\x04\x08\t\x10\nJ\x04\x08\n\x10\x0bJ\x04\x08\x0b\x10\x0cJ\x04\x08\x0c\x10\rJ\x04\x08\r\x10\x0eJ\x04\x08\x14\x10\x15J\x04\x08(\x10)\"\xbe\x08\n\x0cWorkerConfig\x12!\n\x0c\x64ocker_image\x18\x01 \x01(\tR\x0b\x64ockerImage\x12\x12\n\x04host\x18\x02 \x01(\tR\x04host\x12\x12\n\x04port\x18\x03 \x01(\x05R\x04port\x12\x1d\n\nport_range\x18\x04 \x01(\tR\tportRange\x12\x1b\n\tworker_id\x18\x05 \x01(\tR\x08workerId\x12-\n\x12\x63ontroller_address\x18\x06 \x01(\tR\x11\x63ontrollerAddress\x12\x1b\n\tcache_dir\x18\x07 \x01(\tR\x08\x63\x61\x63heDir\x12,\n\x12\x64\x65\x66\x61ult_task_image\x18\t \x01(\tR\x10\x64\x65\x66\x61ultTaskImage\x12\x41\n\x08task_env\x18\n \x03(\x0b\x32&.iris.config.WorkerConfig.TaskEnvEntryR\x07taskEnv\x12\x18\n\x07runtime\x18\x0b 
\x01(\tR\x07runtime\x12G\n\x10\x61\x63\x63\x65lerator_type\x18\x14 \x01(\x0e\x32\x1c.iris.config.AcceleratorTypeR\x0f\x61\x63\x63\x65leratorType\x12/\n\x13\x61\x63\x63\x65lerator_variant\x18\x15 \x01(\tR\x12\x61\x63\x63\x65leratorVariant\x12\x1b\n\tgpu_count\x18\x16 \x01(\x05R\x08gpuCount\x12>\n\rcapacity_type\x18\x18 \x01(\x0e\x32\x19.iris.config.CapacityTypeR\x0c\x63\x61pacityType\x12\\\n\x11worker_attributes\x18\x1e \x03(\x0b\x32/.iris.config.WorkerConfig.WorkerAttributesEntryR\x10workerAttributes\x12\x38\n\rpoll_interval\x18( \x01(\x0b\x32\x13.iris.time.DurationR\x0cpollInterval\x12@\n\x11heartbeat_timeout\x18) \x01(\x0b\x32\x13.iris.time.DurationR\x10heartbeatTimeout\x12\x19\n\x08slice_id\x18\x08 \x01(\tR\x07sliceId\x12\x37\n\x08platform\x18\x32 \x01(\x0b\x32\x1b.iris.config.PlatformConfigR\x08platform\x12%\n\x0estorage_prefix\x18< \x01(\tR\rstoragePrefix\x12\x1d\n\nauth_token\x18\x46 \x01(\tR\tauthToken\x1a:\n\x0cTaskEnvEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x43\n\x15WorkerAttributesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01J\x04\x08\x17\x10\x18\"\xf2\x02\n\tSshConfig\x12\x12\n\x04user\x18\x01 \x01(\tR\x04user\x12\x19\n\x08key_file\x18\x02 \x01(\tR\x07keyFile\x12\x12\n\x04port\x18\x03 \x01(\x05R\x04port\x12<\n\x0f\x63onnect_timeout\x18\x04 \x01(\x0b\x32\x13.iris.time.DurationR\x0e\x63onnectTimeout\x12<\n\tauth_mode\x18\x05 \x01(\x0e\x32\x1f.iris.config.SshConfig.AuthModeR\x08\x61uthMode\x12\"\n\ros_login_user\x18\x06 \x01(\tR\x0bosLoginUser\x12>\n\x1bimpersonate_service_account\x18\x07 \x01(\tR\x19impersonateServiceAccount\"B\n\x08\x41uthMode\x12\x1a\n\x16SSH_AUTH_MODE_METADATA\x10\x00\x12\x1a\n\x16SSH_AUTH_MODE_OS_LOGIN\x10\x01\"a\n\rStorageConfig\x12&\n\x0flocal_state_dir\x18\x01 \x01(\tR\rlocalStateDir\x12(\n\x10remote_state_dir\x18\x02 \x01(\tR\x0eremoteStateDir\"\xb4\x01\n\x13GcpControllerConfig\x12\x12\n\x04zone\x18\x01 
\x01(\tR\x04zone\x12!\n\x0cmachine_type\x18\x02 \x01(\tR\x0bmachineType\x12)\n\x11\x62oot_disk_size_gb\x18\x03 \x01(\x05R\x0e\x62ootDiskSizeGb\x12\x12\n\x04port\x18\x04 \x01(\x05R\x04port\x12\'\n\x0fservice_account\x18\x05 \x01(\tR\x0eserviceAccount\"@\n\x16ManualControllerConfig\x12\x12\n\x04host\x18\x01 \x01(\tR\x04host\x12\x12\n\x04port\x18\x03 \x01(\x05R\x04port\"+\n\x15LocalControllerConfig\x12\x12\n\x04port\x18\x01 \x01(\x05R\x04port\"s\n\x19\x43oreweaveControllerConfig\x12\x12\n\x04port\x18\x01 \x01(\x05R\x04port\x12!\n\x0cservice_name\x18\x02 \x01(\tR\x0bserviceName\x12\x1f\n\x0bscale_group\x18\x03 \x01(\tR\nscaleGroup\"\xdd\x03\n\x12\x43ontrollerVmConfig\x12\x14\n\x05image\x18\n \x01(\tR\x05image\x12:\n\x0eworker_timeout\x18\x0c \x01(\x0b\x32\x13.iris.time.DurationR\rworkerTimeout\x12>\n\x1bheartbeat_failure_threshold\x18\r \x01(\x05R\x19heartbeatFailureThreshold\x12.\n\x13use_split_heartbeat\x18\x0e \x01(\x08R\x11useSplitHeartbeat\x12\x34\n\x03gcp\x18\x01 \x01(\x0b\x32 .iris.config.GcpControllerConfigH\x00R\x03gcp\x12=\n\x06manual\x18\x02 \x01(\x0b\x32#.iris.config.ManualControllerConfigH\x00R\x06manual\x12:\n\x05local\x18\x03 \x01(\x0b\x32\".iris.config.LocalControllerConfigH\x00R\x05local\x12\x46\n\tcoreweave\x18\x04 \x01(\x0b\x32&.iris.config.CoreweaveControllerConfigH\x00R\tcoreweaveB\x0c\n\ncontroller\"\xe4\x02\n\x10\x41utoscalerConfig\x12\x44\n\x13\x65valuation_interval\x18\x01 \x01(\x0b\x32\x13.iris.time.DurationR\x12\x65valuationInterval\x12\x39\n\x0escale_up_delay\x18\x03 \x01(\x0b\x32\x13.iris.time.DurationR\x0cscaleUpDelay\x12=\n\x10scale_down_delay\x18\x04 \x01(\x0b\x32\x13.iris.time.DurationR\x0escaleDownDelay\x12\x45\n\x14startup_grace_period\x18\x05 \x01(\x0b\x32\x13.iris.time.DurationR\x12startupGracePeriod\x12I\n\x16heartbeat_grace_period\x18\x06 \x01(\x0b\x32\x13.iris.time.DurationR\x14heartbeatGracePeriod\"\xb9\x02\n\x0e\x44\x65\x66\x61ultsConfig\x12(\n\x03ssh\x18\x02 
\x01(\x0b\x32\x16.iris.config.SshConfigR\x03ssh\x12=\n\nautoscaler\x18\x03 \x01(\x0b\x32\x1d.iris.config.AutoscalerConfigR\nautoscaler\x12\x31\n\x06worker\x18\x06 \x01(\x0b\x32\x19.iris.config.WorkerConfigR\x06worker\x12\x43\n\x08task_env\x18\x07 \x03(\x0b\x32(.iris.config.DefaultsConfig.TaskEnvEntryR\x07taskEnv\x1a:\n\x0cTaskEnvEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06\".\n\rGcpAuthConfig\x12\x1d\n\nproject_id\x18\x01 \x01(\tR\tprojectId\"\x90\x01\n\x10StaticAuthConfig\x12\x41\n\x06tokens\x18\x01 \x03(\x0b\x32).iris.config.StaticAuthConfig.TokensEntryR\x06tokens\x1a\x39\n\x0bTokensEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\"\xbe\x01\n\nAuthConfig\x12.\n\x03gcp\x18\x01 \x01(\x0b\x32\x1a.iris.config.GcpAuthConfigH\x00R\x03gcp\x12\x37\n\x06static\x18\x02 \x01(\x0b\x32\x1d.iris.config.StaticAuthConfigH\x00R\x06static\x12\x1f\n\x0b\x61\x64min_users\x18\x03 \x03(\tR\nadminUsers\x12\x1a\n\x08optional\x18\x04 \x01(\x08R\x08optionalB\n\n\x08provider\"\x16\n\x14WorkerProviderConfig\"\xcd\x02\n\x18KubernetesProviderConfig\x12\x1c\n\tnamespace\x18\x01 \x01(\tR\tnamespace\x12\x1e\n\nkubeconfig\x18\x02 \x01(\tR\nkubeconfig\x12#\n\rdefault_image\x18\x03 \x01(\tR\x0c\x64\x65\x66\x61ultImage\x12\x36\n\x17\x63olocation_topology_key\x18\x04 \x01(\tR\x15\x63olocationTopologyKey\x12\'\n\x0fservice_account\x18\x05 \x01(\tR\x0eserviceAccount\x12!\n\x0chost_network\x18\x06 \x01(\x08R\x0bhostNetwork\x12\x1b\n\tcache_dir\x18\x07 \x01(\tR\x08\x63\x61\x63heDir\x12-\n\x12\x63ontroller_address\x18\x08 \x01(\tR\x11\x63ontrollerAddress\"\xa4\x05\n\x11IrisClusterConfig\x12\x12\n\x04name\x18\x05 \x01(\tR\x04name\x12\x37\n\x08platform\x18\n \x01(\x0b\x32\x1b.iris.config.PlatformConfigR\x08platform\x12\x37\n\x08\x64\x65\x66\x61ults\x18\x0b 
\x01(\x0b\x32\x1b.iris.config.DefaultsConfigR\x08\x64\x65\x66\x61ults\x12\x34\n\x07storage\x18\x0c \x01(\x0b\x32\x1a.iris.config.StorageConfigR\x07storage\x12?\n\ncontroller\x18\x1f \x01(\x0b\x32\x1f.iris.config.ControllerVmConfigR\ncontroller\x12R\n\x0cscale_groups\x18\x32 \x03(\x0b\x32/.iris.config.IrisClusterConfig.ScaleGroupsEntryR\x0bscaleGroups\x12+\n\x04\x61uth\x18< \x01(\x0b\x32\x17.iris.config.AuthConfigR\x04\x61uth\x12X\n\x13kubernetes_provider\x18\x46 \x01(\x0b\x32%.iris.config.KubernetesProviderConfigH\x00R\x12kubernetesProvider\x12L\n\x0fworker_provider\x18G \x01(\x0b\x32!.iris.config.WorkerProviderConfigH\x00R\x0eworkerProvider\x1a]\n\x10ScaleGroupsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x33\n\x05value\x18\x02 \x01(\x0b\x32\x1d.iris.config.ScaleGroupConfigR\x05value:\x02\x38\x01\x42\n\n\x08provider*\x81\x01\n\x0f\x41\x63\x63\x65leratorType\x12 \n\x1c\x41\x43\x43\x45LERATOR_TYPE_UNSPECIFIED\x10\x00\x12\x18\n\x14\x41\x43\x43\x45LERATOR_TYPE_CPU\x10\x01\x12\x18\n\x14\x41\x43\x43\x45LERATOR_TYPE_GPU\x10\x02\x12\x18\n\x14\x41\x43\x43\x45LERATOR_TYPE_TPU\x10\x03*\x85\x01\n\x0c\x43\x61pacityType\x12\x1d\n\x19\x43\x41PACITY_TYPE_UNSPECIFIED\x10\x00\x12\x1d\n\x19\x43\x41PACITY_TYPE_PREEMPTIBLE\x10\x01\x12\x1b\n\x17\x43\x41PACITY_TYPE_ON_DEMAND\x10\x02\x12\x1a\n\x16\x43\x41PACITY_TYPE_RESERVED\x10\x03\x42p\n\x0f\x63om.iris.configB\x0b\x43onfigProtoP\x01\xa2\x02\x03ICX\xaa\x02\x0bIris.Config\xca\x02\x0bIris\\Config\xe2\x02\x17Iris\\Config\\GPBMetadata\xea\x02\x0cIris::Config\x92\x03\x02\x08\x01\x62\x08\x65\x64itionsp\xe8\x07') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -65,10 +65,10 @@ _globals['_STATICAUTHCONFIG_TOKENSENTRY']._serialized_options = b'8\001' _globals['_IRISCLUSTERCONFIG_SCALEGROUPSENTRY']._loaded_options = None _globals['_IRISCLUSTERCONFIG_SCALEGROUPSENTRY']._serialized_options = b'8\001' - _globals['_ACCELERATORTYPE']._serialized_start=8307 - _globals['_ACCELERATORTYPE']._serialized_end=8436 
- _globals['_CAPACITYTYPE']._serialized_start=8439 - _globals['_CAPACITYTYPE']._serialized_end=8572 + _globals['_ACCELERATORTYPE']._serialized_start=8355 + _globals['_ACCELERATORTYPE']._serialized_end=8484 + _globals['_CAPACITYTYPE']._serialized_start=8487 + _globals['_CAPACITYTYPE']._serialized_end=8620 _globals['_GCPPLATFORMCONFIG']._serialized_start=41 _globals['_GCPPLATFORMCONFIG']._serialized_end=113 _globals['_MANUALPLATFORMCONFIG']._serialized_start=115 @@ -134,27 +134,27 @@ _globals['_COREWEAVECONTROLLERCONFIG']._serialized_start=5655 _globals['_COREWEAVECONTROLLERCONFIG']._serialized_end=5770 _globals['_CONTROLLERVMCONFIG']._serialized_start=5773 - _globals['_CONTROLLERVMCONFIG']._serialized_end=6202 - _globals['_AUTOSCALERCONFIG']._serialized_start=6205 - _globals['_AUTOSCALERCONFIG']._serialized_end=6561 - _globals['_DEFAULTSCONFIG']._serialized_start=6564 - _globals['_DEFAULTSCONFIG']._serialized_end=6877 + _globals['_CONTROLLERVMCONFIG']._serialized_end=6250 + _globals['_AUTOSCALERCONFIG']._serialized_start=6253 + _globals['_AUTOSCALERCONFIG']._serialized_end=6609 + _globals['_DEFAULTSCONFIG']._serialized_start=6612 + _globals['_DEFAULTSCONFIG']._serialized_end=6925 _globals['_DEFAULTSCONFIG_TASKENVENTRY']._serialized_start=4754 _globals['_DEFAULTSCONFIG_TASKENVENTRY']._serialized_end=4812 - _globals['_GCPAUTHCONFIG']._serialized_start=6879 - _globals['_GCPAUTHCONFIG']._serialized_end=6925 - _globals['_STATICAUTHCONFIG']._serialized_start=6928 - _globals['_STATICAUTHCONFIG']._serialized_end=7072 - _globals['_STATICAUTHCONFIG_TOKENSENTRY']._serialized_start=7015 - _globals['_STATICAUTHCONFIG_TOKENSENTRY']._serialized_end=7072 - _globals['_AUTHCONFIG']._serialized_start=7075 - _globals['_AUTHCONFIG']._serialized_end=7265 - _globals['_WORKERPROVIDERCONFIG']._serialized_start=7267 - _globals['_WORKERPROVIDERCONFIG']._serialized_end=7289 - _globals['_KUBERNETESPROVIDERCONFIG']._serialized_start=7292 - 
_globals['_KUBERNETESPROVIDERCONFIG']._serialized_end=7625 - _globals['_IRISCLUSTERCONFIG']._serialized_start=7628 - _globals['_IRISCLUSTERCONFIG']._serialized_end=8304 - _globals['_IRISCLUSTERCONFIG_SCALEGROUPSENTRY']._serialized_start=8199 - _globals['_IRISCLUSTERCONFIG_SCALEGROUPSENTRY']._serialized_end=8292 + _globals['_GCPAUTHCONFIG']._serialized_start=6927 + _globals['_GCPAUTHCONFIG']._serialized_end=6973 + _globals['_STATICAUTHCONFIG']._serialized_start=6976 + _globals['_STATICAUTHCONFIG']._serialized_end=7120 + _globals['_STATICAUTHCONFIG_TOKENSENTRY']._serialized_start=7063 + _globals['_STATICAUTHCONFIG_TOKENSENTRY']._serialized_end=7120 + _globals['_AUTHCONFIG']._serialized_start=7123 + _globals['_AUTHCONFIG']._serialized_end=7313 + _globals['_WORKERPROVIDERCONFIG']._serialized_start=7315 + _globals['_WORKERPROVIDERCONFIG']._serialized_end=7337 + _globals['_KUBERNETESPROVIDERCONFIG']._serialized_start=7340 + _globals['_KUBERNETESPROVIDERCONFIG']._serialized_end=7673 + _globals['_IRISCLUSTERCONFIG']._serialized_start=7676 + _globals['_IRISCLUSTERCONFIG']._serialized_end=8352 + _globals['_IRISCLUSTERCONFIG_SCALEGROUPSENTRY']._serialized_start=8247 + _globals['_IRISCLUSTERCONFIG_SCALEGROUPSENTRY']._serialized_end=8340 # @@protoc_insertion_point(module_scope) diff --git a/lib/iris/src/iris/rpc/config_pb2.pyi b/lib/iris/src/iris/rpc/config_pb2.pyi index f7275b74ea..885677c90e 100644 --- a/lib/iris/src/iris/rpc/config_pb2.pyi +++ b/lib/iris/src/iris/rpc/config_pb2.pyi @@ -405,10 +405,11 @@ class CoreweaveControllerConfig(_message.Message): def __init__(self, port: _Optional[int] = ..., service_name: _Optional[str] = ..., scale_group: _Optional[str] = ...) -> None: ... 
class ControllerVmConfig(_message.Message): - __slots__ = ("image", "worker_timeout", "heartbeat_failure_threshold", "gcp", "manual", "local", "coreweave") + __slots__ = ("image", "worker_timeout", "heartbeat_failure_threshold", "use_split_heartbeat", "gcp", "manual", "local", "coreweave") IMAGE_FIELD_NUMBER: _ClassVar[int] WORKER_TIMEOUT_FIELD_NUMBER: _ClassVar[int] HEARTBEAT_FAILURE_THRESHOLD_FIELD_NUMBER: _ClassVar[int] + USE_SPLIT_HEARTBEAT_FIELD_NUMBER: _ClassVar[int] GCP_FIELD_NUMBER: _ClassVar[int] MANUAL_FIELD_NUMBER: _ClassVar[int] LOCAL_FIELD_NUMBER: _ClassVar[int] @@ -416,11 +417,12 @@ class ControllerVmConfig(_message.Message): image: str worker_timeout: _time_pb2.Duration heartbeat_failure_threshold: int + use_split_heartbeat: bool gcp: GcpControllerConfig manual: ManualControllerConfig local: LocalControllerConfig coreweave: CoreweaveControllerConfig - def __init__(self, image: _Optional[str] = ..., worker_timeout: _Optional[_Union[_time_pb2.Duration, _Mapping]] = ..., heartbeat_failure_threshold: _Optional[int] = ..., gcp: _Optional[_Union[GcpControllerConfig, _Mapping]] = ..., manual: _Optional[_Union[ManualControllerConfig, _Mapping]] = ..., local: _Optional[_Union[LocalControllerConfig, _Mapping]] = ..., coreweave: _Optional[_Union[CoreweaveControllerConfig, _Mapping]] = ...) -> None: ... + def __init__(self, image: _Optional[str] = ..., worker_timeout: _Optional[_Union[_time_pb2.Duration, _Mapping]] = ..., heartbeat_failure_threshold: _Optional[int] = ..., use_split_heartbeat: _Optional[bool] = ..., gcp: _Optional[_Union[GcpControllerConfig, _Mapping]] = ..., manual: _Optional[_Union[ManualControllerConfig, _Mapping]] = ..., local: _Optional[_Union[LocalControllerConfig, _Mapping]] = ..., coreweave: _Optional[_Union[CoreweaveControllerConfig, _Mapping]] = ...) -> None: ... 
class AutoscalerConfig(_message.Message): __slots__ = ("evaluation_interval", "scale_up_delay", "scale_down_delay", "startup_grace_period", "heartbeat_grace_period") diff --git a/lib/iris/src/iris/rpc/controller.proto b/lib/iris/src/iris/rpc/controller.proto index 6e61551323..c0eaf14c83 100644 --- a/lib/iris/src/iris/rpc/controller.proto +++ b/lib/iris/src/iris/rpc/controller.proto @@ -510,6 +510,14 @@ message Controller { repeated GetUserBudgetResponse users = 1; } + // --- Heartbeat Refactor: Worker pushes task state transitions --- + message UpdateTaskStatusRequest { + string worker_id = 1; + repeated iris.job.WorkerTaskStatus updates = 2; + } + message UpdateTaskStatusResponse { + } + // --- Scheduler State --- message GetSchedulerStateRequest {} @@ -637,4 +645,7 @@ service ControllerService { // Scheduler state (dashboard) rpc GetSchedulerState(Controller.GetSchedulerStateRequest) returns (Controller.GetSchedulerStateResponse); + + // Worker-to-controller task status push + rpc UpdateTaskStatus(Controller.UpdateTaskStatusRequest) returns (Controller.UpdateTaskStatusResponse); } diff --git a/lib/iris/src/iris/rpc/controller_connect.py b/lib/iris/src/iris/rpc/controller_connect.py index 113762233c..c5ec544e30 100644 --- a/lib/iris/src/iris/rpc/controller_connect.py +++ b/lib/iris/src/iris/rpc/controller_connect.py @@ -124,6 +124,9 @@ async def list_user_budgets(self, request: controller__pb2.Controller.ListUserBu async def get_scheduler_state(self, request: controller__pb2.Controller.GetSchedulerStateRequest, ctx: RequestContext) -> controller__pb2.Controller.GetSchedulerStateResponse: raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + async def update_task_status(self, request: controller__pb2.Controller.UpdateTaskStatusRequest, ctx: RequestContext) -> controller__pb2.Controller.UpdateTaskStatusResponse: + raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + class ControllerServiceASGIApplication(ConnectASGIApplication[ControllerService]): def 
__init__(self, service: ControllerService | AsyncGenerator[ControllerService], *, interceptors: Iterable[Interceptor]=(), read_max_bytes: int | None = None, compressions: Iterable[Compression] | None = None) -> None: @@ -480,6 +483,16 @@ def __init__(self, service: ControllerService | AsyncGenerator[ControllerService ), function=svc.get_scheduler_state, ), + "/iris.cluster.ControllerService/UpdateTaskStatus": Endpoint.unary( + method=MethodInfo( + name="UpdateTaskStatus", + service_name="iris.cluster.ControllerService", + input=controller__pb2.Controller.UpdateTaskStatusRequest, + output=controller__pb2.Controller.UpdateTaskStatusResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + function=svc.update_task_status, + ), }, interceptors=interceptors, read_max_bytes=read_max_bytes, @@ -1193,6 +1206,26 @@ async def get_scheduler_state( timeout_ms=timeout_ms, ) + async def update_task_status( + self, + request: controller__pb2.Controller.UpdateTaskStatusRequest, + *, + headers: Headers | Mapping[str, str] | None = None, + timeout_ms: int | None = None, + ) -> controller__pb2.Controller.UpdateTaskStatusResponse: + return await self.execute_unary( + request=request, + method=MethodInfo( + name="UpdateTaskStatus", + service_name="iris.cluster.ControllerService", + input=controller__pb2.Controller.UpdateTaskStatusRequest, + output=controller__pb2.Controller.UpdateTaskStatusResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + headers=headers, + timeout_ms=timeout_ms, + ) + class ControllerServiceSync(Protocol): def launch_job(self, request: controller__pb2.Controller.LaunchJobRequest, ctx: RequestContext) -> controller__pb2.Controller.LaunchJobResponse: @@ -1265,6 +1298,8 @@ def list_user_budgets(self, request: controller__pb2.Controller.ListUserBudgetsR raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") def get_scheduler_state(self, request: controller__pb2.Controller.GetSchedulerStateRequest, ctx: RequestContext) -> 
controller__pb2.Controller.GetSchedulerStateResponse: raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + def update_task_status(self, request: controller__pb2.Controller.UpdateTaskStatusRequest, ctx: RequestContext) -> controller__pb2.Controller.UpdateTaskStatusResponse: + raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") class ControllerServiceWSGIApplication(ConnectWSGIApplication): @@ -1621,6 +1656,16 @@ def __init__(self, service: ControllerServiceSync, interceptors: Iterable[Interc ), function=service.get_scheduler_state, ), + "/iris.cluster.ControllerService/UpdateTaskStatus": EndpointSync.unary( + method=MethodInfo( + name="UpdateTaskStatus", + service_name="iris.cluster.ControllerService", + input=controller__pb2.Controller.UpdateTaskStatusRequest, + output=controller__pb2.Controller.UpdateTaskStatusResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + function=service.update_task_status, + ), }, interceptors=interceptors, read_max_bytes=read_max_bytes, @@ -2333,3 +2378,23 @@ def get_scheduler_state( headers=headers, timeout_ms=timeout_ms, ) + + def update_task_status( + self, + request: controller__pb2.Controller.UpdateTaskStatusRequest, + *, + headers: Headers | Mapping[str, str] | None = None, + timeout_ms: int | None = None, + ) -> controller__pb2.Controller.UpdateTaskStatusResponse: + return self.execute_unary( + request=request, + method=MethodInfo( + name="UpdateTaskStatus", + service_name="iris.cluster.ControllerService", + input=controller__pb2.Controller.UpdateTaskStatusRequest, + output=controller__pb2.Controller.UpdateTaskStatusResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + headers=headers, + timeout_ms=timeout_ms, + ) diff --git a/lib/iris/src/iris/rpc/controller_pb2.py b/lib/iris/src/iris/rpc/controller_pb2.py index 9e9915772d..e8bfaf1e36 100644 --- a/lib/iris/src/iris/rpc/controller_pb2.py +++ b/lib/iris/src/iris/rpc/controller_pb2.py @@ -29,7 +29,7 @@ from . 
import vm_pb2 as vm__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10\x63ontroller.proto\x12\x0ciris.cluster\x1a\tjob.proto\x1a\rlogging.proto\x1a\x0bquery.proto\x1a\ntime.proto\x1a\x08vm.proto\"\xaeY\n\nController\x1a\xc7\x08\n\x10LaunchJobRequest\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12;\n\nentrypoint\x18\x02 \x01(\x0b\x32\x1b.iris.job.RuntimeEntrypointR\nentrypoint\x12\x39\n\tresources\x18\x03 \x01(\x0b\x32\x1b.iris.job.ResourceSpecProtoR\tresources\x12=\n\x0b\x65nvironment\x18\x04 \x01(\x0b\x32\x1b.iris.job.EnvironmentConfigR\x0b\x65nvironment\x12\x1b\n\tbundle_id\x18\x05 \x01(\tR\x08\x62undleId\x12\x1f\n\x0b\x62undle_blob\x18\x06 \x01(\x0cR\nbundleBlob\x12\x42\n\x12scheduling_timeout\x18\x08 \x01(\x0b\x32\x13.iris.time.DurationR\x11schedulingTimeout\x12\x14\n\x05ports\x18\t \x03(\tR\x05ports\x12*\n\x11max_task_failures\x18\x0b \x01(\x05R\x0fmaxTaskFailures\x12.\n\x13max_retries_failure\x18\x0c \x01(\x05R\x11maxRetriesFailure\x12\x34\n\x16max_retries_preemption\x18\r \x01(\x05R\x14maxRetriesPreemption\x12\x36\n\x0b\x63onstraints\x18\x0e \x03(\x0b\x32\x14.iris.job.ConstraintR\x0b\x63onstraints\x12@\n\x0c\x63oscheduling\x18\x0f \x01(\x0b\x32\x1c.iris.job.CoschedulingConfigR\x0c\x63oscheduling\x12\x1a\n\x08replicas\x18\x14 \x01(\x05R\x08replicas\x12-\n\x07timeout\x18\x15 \x01(\x0b\x32\x13.iris.time.DurationR\x07timeout\x12$\n\x0e\x66\x61il_if_exists\x18\x16 \x01(\x08R\x0c\x66\x61ilIfExists\x12=\n\x0breservation\x18\x1e \x01(\x0b\x32\x1b.iris.job.ReservationConfigR\x0breservation\x12J\n\x11preemption_policy\x18\x1f \x01(\x0e\x32\x1d.iris.job.JobPreemptionPolicyR\x10preemptionPolicy\x12K\n\x13\x65xisting_job_policy\x18 \x01(\x0e\x32\x1b.iris.job.ExistingJobPolicyR\x11\x65xistingJobPolicy\x12;\n\rpriority_band\x18! 
\x01(\x0e\x32\x16.iris.job.PriorityBandR\x0cpriorityBand\x12\x1d\n\ntask_image\x18\" \x01(\tR\ttaskImage\x12\x1f\n\x0bsubmit_argv\x18# \x03(\tR\nsubmitArgv\x1a*\n\x11LaunchJobResponse\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a\x38\n\x13GetJobStatusRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobIdJ\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04\x1a\xfa\x01\n\x14GetJobStatusResponse\x12%\n\x03job\x18\x01 \x01(\x0b\x32\x13.iris.job.JobStatusR\x03job\x12\x43\n\x07request\x18\x02 \x01(\x0b\x32).iris.cluster.Controller.LaunchJobRequestR\x07request\x12:\n\x0cresource_min\x18\x03 \x01(\x0b\x32\x17.iris.job.ResourceUsageR\x0bresourceMin\x12:\n\x0cresource_max\x18\x04 \x01(\x0b\x32\x17.iris.job.ResourceUsageR\x0bresourceMax\x1a-\n\x12GetJobStateRequest\x12\x17\n\x07job_ids\x18\x01 \x03(\tR\x06jobIds\x1a\xb6\x01\n\x13GetJobStateResponse\x12P\n\x06states\x18\x01 \x03(\x0b\x32\x38.iris.cluster.Controller.GetJobStateResponse.StatesEntryR\x06states\x1aM\n\x0bStatesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12(\n\x05value\x18\x02 \x01(\x0e\x32\x12.iris.job.JobStateR\x05value:\x02\x38\x01\x1a,\n\x13TerminateJobRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a\xf3\x02\n\x08JobQuery\x12<\n\x05scope\x18\x01 \x01(\x0e\x32&.iris.cluster.Controller.JobQueryScopeR\x05scope\x12\"\n\rparent_job_id\x18\x02 \x01(\tR\x0bparentJobId\x12\x1f\n\x0bname_filter\x18\x03 \x01(\tR\nnameFilter\x12!\n\x0cstate_filter\x18\x04 \x01(\tR\x0bstateFilter\x12\x44\n\nsort_field\x18\x05 \x01(\x0e\x32%.iris.cluster.Controller.JobSortFieldR\tsortField\x12M\n\x0esort_direction\x18\x06 \x01(\x0e\x32&.iris.cluster.Controller.SortDirectionR\rsortDirection\x12\x16\n\x06offset\x18\x07 \x01(\x05R\x06offset\x12\x14\n\x05limit\x18\x08 \x01(\x05R\x05limit\x1a\xc9\x01\n\x0fListJobsRequest\x12\x37\n\x05query\x18\t 
\x01(\x0b\x32!.iris.cluster.Controller.JobQueryR\x05queryJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x06\x10\x07J\x04\x08\x07\x10\x08R\x06offsetR\x05limitR\nsort_fieldR\x0esort_directionR\x0bname_filterR\x0cstate_filterR\rparent_job_id\x1aw\n\x10ListJobsResponse\x12\'\n\x04jobs\x18\x01 \x03(\x0b\x32\x13.iris.job.JobStatusR\x04jobs\x12\x1f\n\x0btotal_count\x18\x02 \x01(\x05R\ntotalCount\x12\x19\n\x08has_more\x18\x03 \x01(\x08R\x07hasMore\x1a/\n\x14GetTaskStatusRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x1a\x83\x01\n\x15GetTaskStatusResponse\x12(\n\x04task\x18\x01 \x01(\x0b\x32\x14.iris.job.TaskStatusR\x04task\x12@\n\rjob_resources\x18\x02 \x01(\x0b\x32\x1b.iris.job.ResourceSpecProtoR\x0cjobResources\x1a)\n\x10ListTasksRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a?\n\x11ListTasksResponse\x12*\n\x05tasks\x18\x01 \x03(\x0b\x32\x14.iris.job.TaskStatusR\x05tasks\x1at\n\x16\x45xecInContainerRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x18\n\x07\x63ommand\x18\x02 \x03(\tR\x07\x63ommand\x12\'\n\x0ftimeout_seconds\x18\x03 \x01(\x05R\x0etimeoutSeconds\x1a|\n\x17\x45xecInContainerResponse\x12\x1b\n\texit_code\x18\x01 \x01(\x05R\x08\x65xitCode\x12\x16\n\x06stdout\x18\x02 \x01(\tR\x06stdout\x12\x16\n\x06stderr\x18\x03 \x01(\tR\x06stderr\x12\x14\n\x05\x65rror\x18\x04 \x01(\tR\x05\x65rror\x1a\xb4\x01\n\nWorkerInfo\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x18\n\x07\x61\x64\x64ress\x18\x02 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x03 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12\x39\n\rregistered_at\x18\x04 \x01(\x0b\x32\x14.iris.time.TimestampR\x0cregisteredAt\x1a\xda\x02\n\x12WorkerHealthStatus\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x18\n\x07healthy\x18\x02 \x01(\x08R\x07healthy\x12\x31\n\x14\x63onsecutive_failures\x18\x03 \x01(\x05R\x13\x63onsecutiveFailures\x12;\n\x0elast_heartbeat\x18\x04 
\x01(\x0b\x32\x14.iris.time.TimestampR\rlastHeartbeat\x12&\n\x0frunning_job_ids\x18\x05 \x03(\tR\rrunningJobIds\x12\x18\n\x07\x61\x64\x64ress\x18\x06 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x07 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12%\n\x0estatus_message\x18\x08 \x01(\tR\rstatusMessage\x1a\x14\n\x12ListWorkersRequest\x1a\\\n\x13ListWorkersResponse\x12\x45\n\x07workers\x18\x01 \x03(\x0b\x32+.iris.cluster.Controller.WorkerHealthStatusR\x07workers\x1a\xba\x01\n\x0fRegisterRequest\x12\x18\n\x07\x61\x64\x64ress\x18\x01 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x02 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12\x1b\n\tworker_id\x18\x03 \x01(\tR\x08workerId\x12\x19\n\x08slice_id\x18\x04 \x01(\tR\x07sliceId\x12\x1f\n\x0bscale_group\x18\x05 \x01(\tR\nscaleGroup\x1aK\n\x10RegisterResponse\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x1a\n\x08\x61\x63\x63\x65pted\x18\x02 \x01(\x08R\x08\x61\x63\x63\x65pted\x1a\xfc\x01\n\x08\x45ndpoint\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x18\n\x07\x61\x64\x64ress\x18\x03 \x01(\tR\x07\x61\x64\x64ress\x12\x17\n\x07task_id\x18\x04 \x01(\tR\x06taskId\x12K\n\x08metadata\x18\x05 \x03(\x0b\x32/.iris.cluster.Controller.Endpoint.MetadataEntryR\x08metadata\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\xb9\x02\n\x17RegisterEndpointRequest\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x18\n\x07\x61\x64\x64ress\x18\x02 \x01(\tR\x07\x61\x64\x64ress\x12\x17\n\x07task_id\x18\x03 \x01(\tR\x06taskId\x12Z\n\x08metadata\x18\x04 \x03(\x0b\x32>.iris.cluster.Controller.RegisterEndpointRequest.MetadataEntryR\x08metadata\x12\x1d\n\nattempt_id\x18\x05 \x01(\x05R\tattemptId\x12\x1f\n\x0b\x65ndpoint_id\x18\x06 \x01(\tR\nendpointId\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 
\x01(\tR\x05value:\x02\x38\x01\x1a;\n\x18RegisterEndpointResponse\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x1a<\n\x19UnregisterEndpointRequest\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x1a\x44\n\x14ListEndpointsRequest\x12\x16\n\x06prefix\x18\x01 \x01(\tR\x06prefix\x12\x14\n\x05\x65xact\x18\x02 \x01(\x08R\x05\x65xact\x1aX\n\x15ListEndpointsResponse\x12?\n\tendpoints\x18\x01 \x03(\x0b\x32!.iris.cluster.Controller.EndpointR\tendpoints\x1a\x1c\n\x1aGetAutoscalerStatusRequest\x1aP\n\x1bGetAutoscalerStatusResponse\x12\x31\n\x06status\x18\x01 \x01(\x0b\x32\x19.iris.vm.AutoscalerStatusR\x06status\x1a\x96\x01\n\x11TransactionAction\x12\x32\n\ttimestamp\x18\x01 \x01(\x0b\x32\x14.iris.time.TimestampR\ttimestamp\x12\x16\n\x06\x61\x63tion\x18\x02 \x01(\tR\x06\x61\x63tion\x12\x1b\n\tentity_id\x18\x03 \x01(\tR\x08\x65ntityId\x12\x18\n\x07\x64\x65tails\x18\x04 \x01(\tR\x07\x64\x65tails\x1a.\n\x16GetTransactionsRequest\x12\x14\n\x05limit\x18\x01 \x01(\x05R\x05limit\x1a_\n\x17GetTransactionsResponse\x12\x44\n\x07\x61\x63tions\x18\x01 \x03(\x0b\x32*.iris.cluster.Controller.TransactionActionR\x07\x61\x63tions\x1a\x18\n\x16\x42\x65ginCheckpointRequest\x1a\xd6\x01\n\x17\x42\x65ginCheckpointResponse\x12\'\n\x0f\x63heckpoint_path\x18\x01 \x01(\tR\x0e\x63heckpointPath\x12\x33\n\ncreated_at\x18\x02 \x01(\x0b\x32\x14.iris.time.TimestampR\tcreatedAt\x12\x1b\n\tjob_count\x18\x03 \x01(\x05R\x08jobCount\x12\x1d\n\ntask_count\x18\x04 \x01(\x05R\ttaskCount\x12!\n\x0cworker_count\x18\x05 \x01(\x05R\x0bworkerCount\x1a\xf3\x02\n\x0bUserSummary\x12\x12\n\x04user\x18\x01 \x01(\tR\x04user\x12\x65\n\x11task_state_counts\x18\x02 \x03(\x0b\x32\x39.iris.cluster.Controller.UserSummary.TaskStateCountsEntryR\x0ftaskStateCounts\x12\x62\n\x10job_state_counts\x18\x03 \x03(\x0b\x32\x38.iris.cluster.Controller.UserSummary.JobStateCountsEntryR\x0ejobStateCounts\x1a\x42\n\x14TaskStateCountsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 
\x01(\x05R\x05value:\x02\x38\x01\x1a\x41\n\x13JobStateCountsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\x05R\x05value:\x02\x38\x01\x1a\x12\n\x10ListUsersRequest\x1aO\n\x11ListUsersResponse\x12:\n\x05users\x18\x01 \x03(\x0b\x32$.iris.cluster.Controller.UserSummaryR\x05users\x1a\x98\x02\n\x12GetTaskLogsRequest\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12)\n\x10include_children\x18\x02 \x01(\x08R\x0fincludeChildren\x12\x19\n\x08since_ms\x18\x03 \x01(\x03R\x07sinceMs\x12&\n\x0fmax_total_lines\x18\x04 \x01(\x03R\rmaxTotalLines\x12\x1c\n\tsubstring\x18\x05 \x01(\tR\tsubstring\x12\x1d\n\nattempt_id\x18\x06 \x01(\x05R\tattemptId\x12\x1b\n\tmin_level\x18\x07 \x01(\tR\x08minLevel\x12\x16\n\x06\x63ursor\x18\x08 \x01(\x03R\x06\x63ursor\x12\x12\n\x04tail\x18\t \x01(\x08R\x04tail\x1a\x86\x01\n\x0cTaskLogBatch\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12*\n\x04logs\x18\x02 \x03(\x0b\x32\x16.iris.logging.LogEntryR\x04logs\x12\x14\n\x05\x65rror\x18\x03 \x01(\tR\x05\x65rror\x12\x1b\n\tworker_id\x18\x04 \x01(\tR\x08workerId\x1a\xd2\x01\n\x13GetTaskLogsResponse\x12\x42\n\ttask_logs\x18\x01 \x03(\x0b\x32%.iris.cluster.Controller.TaskLogBatchR\x08taskLogs\x12\x1c\n\ttruncated\x18\x02 \x01(\x08R\ttruncated\x12\x41\n\x12\x63hild_job_statuses\x18\x03 \x03(\x0b\x32\x13.iris.job.JobStatusR\x10\x63hildJobStatuses\x12\x16\n\x06\x63ursor\x18\x04 \x01(\x03R\x06\x63ursor\x1a(\n\x16GetWorkerStatusRequest\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x1a\xe8\x03\n\x17GetWorkerStatusResponse\x12\x1f\n\x02vm\x18\x01 \x01(\x0b\x32\x0f.iris.vm.VmInfoR\x02vm\x12\x1f\n\x0bscale_group\x18\x02 \x01(\tR\nscaleGroup\x12\x43\n\x06worker\x18\x03 \x01(\x0b\x32+.iris.cluster.Controller.WorkerHealthStatusR\x06worker\x12%\n\x0e\x62ootstrap_logs\x18\x04 \x01(\tR\rbootstrapLogs\x12\x44\n\x12worker_log_entries\x18\t \x03(\x0b\x32\x16.iris.logging.LogEntryR\x10workerLogEntries\x12\x37\n\x0crecent_tasks\x18\x06 
\x03(\x0b\x32\x14.iris.job.TaskStatusR\x0brecentTasks\x12M\n\x11\x63urrent_resources\x18\x07 \x01(\x0b\x32 .iris.job.WorkerResourceSnapshotR\x10\x63urrentResources\x12K\n\x10resource_history\x18\x08 \x03(\x0b\x32 .iris.job.WorkerResourceSnapshotR\x0fresourceHistoryJ\x04\x08\x05\x10\x06\x1a\xce\x01\n\x0fSchedulingEvent\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x1d\n\nattempt_id\x18\x02 \x01(\x05R\tattemptId\x12\x1d\n\nevent_type\x18\x03 \x01(\tR\teventType\x12\x16\n\x06reason\x18\x04 \x01(\tR\x06reason\x12\x18\n\x07message\x18\x05 \x01(\tR\x07message\x12\x32\n\ttimestamp\x18\x06 \x01(\x0b\x32\x14.iris.time.TimestampR\ttimestamp\x1a\x8e\x02\n\x0f\x43lusterCapacity\x12+\n\x11schedulable_nodes\x18\x01 \x01(\x05R\x10schedulableNodes\x12\x30\n\x14total_cpu_millicores\x18\x02 \x01(\x03R\x12totalCpuMillicores\x12\x38\n\x18\x61vailable_cpu_millicores\x18\x03 \x01(\x03R\x16\x61vailableCpuMillicores\x12,\n\x12total_memory_bytes\x18\x04 \x01(\x03R\x10totalMemoryBytes\x12\x34\n\x16\x61vailable_memory_bytes\x18\x05 \x01(\x03R\x14\x61vailableMemoryBytes\x1a\x1a\n\x18GetProviderStatusRequest\x1a\xe8\x01\n\x19GetProviderStatusResponse\x12.\n\x13has_direct_provider\x18\x01 \x01(\x08R\x11hasDirectProvider\x12U\n\x11scheduling_events\x18\x02 \x03(\x0b\x32(.iris.cluster.Controller.SchedulingEventR\x10schedulingEvents\x12\x44\n\x08\x63\x61pacity\x18\x03 \x01(\x0b\x32(.iris.cluster.Controller.ClusterCapacityR\x08\x63\x61pacity\x1a#\n!GetKubernetesClusterStatusRequest\x1a\xed\x01\n\x13KubernetesPodStatus\x12\x19\n\x08pod_name\x18\x01 \x01(\tR\x07podName\x12\x17\n\x07task_id\x18\x02 \x01(\tR\x06taskId\x12\x14\n\x05phase\x18\x03 \x01(\tR\x05phase\x12\x16\n\x06reason\x18\x04 \x01(\tR\x06reason\x12\x18\n\x07message\x18\x05 \x01(\tR\x07message\x12=\n\x0flast_transition\x18\x06 \x01(\x0b\x32\x14.iris.time.TimestampR\x0elastTransition\x12\x1b\n\tnode_name\x18\x07 \x01(\tR\x08nodeName\x1a\x8f\x03\n\x0eNodePoolStatus\x12\x12\n\x04name\x18\x01 
\x01(\tR\x04name\x12#\n\rinstance_type\x18\x02 \x01(\tR\x0cinstanceType\x12\x1f\n\x0bscale_group\x18\x03 \x01(\tR\nscaleGroup\x12!\n\x0ctarget_nodes\x18\x04 \x01(\x05R\x0btargetNodes\x12#\n\rcurrent_nodes\x18\x05 \x01(\x05R\x0c\x63urrentNodes\x12!\n\x0cqueued_nodes\x18\x06 \x01(\x05R\x0bqueuedNodes\x12*\n\x11in_progress_nodes\x18\x07 \x01(\x05R\x0finProgressNodes\x12 \n\x0b\x61utoscaling\x18\x08 \x01(\x08R\x0b\x61utoscaling\x12\x1b\n\tmin_nodes\x18\t \x01(\x05R\x08minNodes\x12\x1b\n\tmax_nodes\x18\n \x01(\x05R\x08maxNodes\x12\x1a\n\x08\x63\x61pacity\x18\x0b \x01(\tR\x08\x63\x61pacity\x12\x14\n\x05quota\x18\x0c \x01(\tR\x05quota\x1a\xac\x03\n\"GetKubernetesClusterStatusResponse\x12\x1c\n\tnamespace\x18\x01 \x01(\tR\tnamespace\x12\x1f\n\x0btotal_nodes\x18\x02 \x01(\x05R\ntotalNodes\x12+\n\x11schedulable_nodes\x18\x03 \x01(\x05R\x10schedulableNodes\x12\'\n\x0f\x61llocatable_cpu\x18\x04 \x01(\tR\x0e\x61llocatableCpu\x12-\n\x12\x61llocatable_memory\x18\x05 \x01(\tR\x11\x61llocatableMemory\x12O\n\x0cpod_statuses\x18\x06 \x03(\x0b\x32,.iris.cluster.Controller.KubernetesPodStatusR\x0bpodStatuses\x12)\n\x10provider_version\x18\x07 \x01(\tR\x0fproviderVersion\x12\x46\n\nnode_pools\x18\x08 \x03(\x0b\x32\'.iris.cluster.Controller.NodePoolStatusR\tnodePools\x1a\x33\n\x14RestartWorkerRequest\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x1aI\n\x15RestartWorkerResponse\x12\x1a\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08R\x08\x61\x63\x63\x65pted\x12\x14\n\x05\x65rror\x18\x02 \x01(\tR\x05\x65rror\x1a\x85\x01\n\x14SetUserBudgetRequest\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 \x01(\x03R\x0b\x62udgetLimit\x12\x31\n\x08max_band\x18\x03 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x1a\x17\n\x15SetUserBudgetResponse\x1a/\n\x14GetUserBudgetRequest\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x1a\xa9\x01\n\x15GetUserBudgetResponse\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 
\x01(\x03R\x0b\x62udgetLimit\x12!\n\x0c\x62udget_spent\x18\x03 \x01(\x03R\x0b\x62udgetSpent\x12\x31\n\x08max_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x1a\x18\n\x16ListUserBudgetsRequest\x1a_\n\x17ListUserBudgetsResponse\x12\x44\n\x05users\x18\x01 \x03(\x0b\x32..iris.cluster.Controller.GetUserBudgetResponseR\x05users\x1a\x1a\n\x18GetSchedulerStateRequest\x1a\xa7\x02\n\x12SchedulerTaskEntry\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x15\n\x06job_id\x18\x02 \x01(\tR\x05jobId\x12\x17\n\x07user_id\x18\x03 \x01(\tR\x06userId\x12;\n\roriginal_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x0coriginalBand\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12%\n\x0equeue_position\x18\x06 \x01(\x05R\rqueuePosition\x12%\n\x0eresource_value\x18\x07 \x01(\x05R\rresourceValue\x1a\xa7\x01\n\x12SchedulerBandGroup\x12*\n\x04\x62\x61nd\x18\x01 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x04\x62\x61nd\x12\x41\n\x05tasks\x18\x02 \x03(\x0b\x32+.iris.cluster.Controller.SchedulerTaskEntryR\x05tasks\x12\"\n\rtotal_in_band\x18\x03 \x01(\x05R\x0btotalInBand\x1a\x97\x02\n\x13SchedulerUserBudget\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 \x01(\x03R\x0b\x62udgetLimit\x12!\n\x0c\x62udget_spent\x18\x03 \x01(\x03R\x0b\x62udgetSpent\x12\x31\n\x08max_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12/\n\x13utilization_percent\x18\x06 \x01(\x02R\x12utilizationPercent\x1a\xea\x02\n\x14SchedulerRunningTask\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x15\n\x06job_id\x18\x02 \x01(\tR\x05jobId\x12\x17\n\x07user_id\x18\x03 \x01(\tR\x06userId\x12\x1b\n\tworker_id\x18\x04 \x01(\tR\x08workerId\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12%\n\x0eresource_value\x18\x06 
\x01(\x05R\rresourceValue\x12 \n\x0bpreemptible\x18\x07 \x01(\x08R\x0bpreemptible\x12=\n\x0epreemptible_by\x18\x08 \x03(\x0e\x32\x16.iris.job.PriorityBandR\rpreemptibleBy\x12%\n\x0eis_coscheduled\x18\t \x01(\x08R\risCoscheduled\x1a\xdc\x02\n\x19GetSchedulerStateResponse\x12P\n\rpending_queue\x18\x01 \x03(\x0b\x32+.iris.cluster.Controller.SchedulerBandGroupR\x0cpendingQueue\x12O\n\x0cuser_budgets\x18\x02 \x03(\x0b\x32,.iris.cluster.Controller.SchedulerUserBudgetR\x0buserBudgets\x12R\n\rrunning_tasks\x18\x03 \x03(\x0b\x32-.iris.cluster.Controller.SchedulerRunningTaskR\x0crunningTasks\x12#\n\rtotal_pending\x18\x04 \x01(\x05R\x0ctotalPending\x12#\n\rtotal_running\x18\x05 \x01(\x05R\x0ctotalRunning\"\xb7\x01\n\x0cJobSortField\x12\x1e\n\x1aJOB_SORT_FIELD_UNSPECIFIED\x10\x00\x12\x17\n\x13JOB_SORT_FIELD_DATE\x10\x01\x12\x17\n\x13JOB_SORT_FIELD_NAME\x10\x02\x12\x18\n\x14JOB_SORT_FIELD_STATE\x10\x03\x12\x1b\n\x17JOB_SORT_FIELD_FAILURES\x10\x04\x12\x1e\n\x1aJOB_SORT_FIELD_PREEMPTIONS\x10\x05\"`\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x16\n\x12SORT_DIRECTION_ASC\x10\x01\x12\x17\n\x13SORT_DIRECTION_DESC\x10\x02\"\x82\x01\n\rJobQueryScope\x12\x1f\n\x1bJOB_QUERY_SCOPE_UNSPECIFIED\x10\x00\x12\x17\n\x13JOB_QUERY_SCOPE_ALL\x10\x01\x12\x19\n\x15JOB_QUERY_SCOPE_ROOTS\x10\x02\x12\x1c\n\x18JOB_QUERY_SCOPE_CHILDREN\x10\x03\x32\x8b\x1c\n\x11\x43ontrollerService\x12\x62\n\tLaunchJob\x12).iris.cluster.Controller.LaunchJobRequest\x1a*.iris.cluster.Controller.LaunchJobResponse\x12k\n\x0cGetJobStatus\x12,.iris.cluster.Controller.GetJobStatusRequest\x1a-.iris.cluster.Controller.GetJobStatusResponse\x12h\n\x0bGetJobState\x12+.iris.cluster.Controller.GetJobStateRequest\x1a,.iris.cluster.Controller.GetJobStateResponse\x12M\n\x0cTerminateJob\x12,.iris.cluster.Controller.TerminateJobRequest\x1a\x0f.iris.job.Empty\x12_\n\x08ListJobs\x12(.iris.cluster.Controller.ListJobsRequest\x1a).iris.cluster.Controller.ListJobsResponse\x12n\n\rGetTaskStatus\x12-.iris.cluster.Controller.
GetTaskStatusRequest\x1a..iris.cluster.Controller.GetTaskStatusResponse\x12\x62\n\tListTasks\x12).iris.cluster.Controller.ListTasksRequest\x1a*.iris.cluster.Controller.ListTasksResponse\x12_\n\x08Register\x12(.iris.cluster.Controller.RegisterRequest\x1a).iris.cluster.Controller.RegisterResponse\x12h\n\x0bListWorkers\x12+.iris.cluster.Controller.ListWorkersRequest\x1a,.iris.cluster.Controller.ListWorkersResponse\x12w\n\x10RegisterEndpoint\x12\x30.iris.cluster.Controller.RegisterEndpointRequest\x1a\x31.iris.cluster.Controller.RegisterEndpointResponse\x12Y\n\x12UnregisterEndpoint\x12\x32.iris.cluster.Controller.UnregisterEndpointRequest\x1a\x0f.iris.job.Empty\x12n\n\rListEndpoints\x12-.iris.cluster.Controller.ListEndpointsRequest\x1a..iris.cluster.Controller.ListEndpointsResponse\x12\x80\x01\n\x13GetAutoscalerStatus\x12\x33.iris.cluster.Controller.GetAutoscalerStatusRequest\x1a\x34.iris.cluster.Controller.GetAutoscalerStatusResponse\x12t\n\x0fGetTransactions\x12/.iris.cluster.Controller.GetTransactionsRequest\x1a\x30.iris.cluster.Controller.GetTransactionsResponse\x12\x62\n\tListUsers\x12).iris.cluster.Controller.ListUsersRequest\x1a*.iris.cluster.Controller.ListUsersResponse\x12h\n\x0bGetTaskLogs\x12+.iris.cluster.Controller.GetTaskLogsRequest\x1a,.iris.cluster.Controller.GetTaskLogsResponse\x12J\n\x0bProfileTask\x12\x1c.iris.job.ProfileTaskRequest\x1a\x1d.iris.job.ProfileTaskResponse\x12t\n\x0f\x45xecInContainer\x12/.iris.cluster.Controller.ExecInContainerRequest\x1a\x30.iris.cluster.Controller.ExecInContainerResponse\x12t\n\x0fGetWorkerStatus\x12/.iris.cluster.Controller.GetWorkerStatusRequest\x1a\x30.iris.cluster.Controller.GetWorkerStatusResponse\x12t\n\x0f\x42\x65ginCheckpoint\x12/.iris.cluster.Controller.BeginCheckpointRequest\x1a\x30.iris.cluster.Controller.BeginCheckpointResponse\x12Y\n\x10GetProcessStatus\x12!.iris.job.GetProcessStatusRequest\x1a\".iris.job.GetProcessStatusResponse\x12J\n\x0bGetAuthInfo\x12\x1c.iris.job.GetAuthInfoRequest\x1a\x1d.iris.job.Get
AuthInfoResponse\x12\x38\n\x05Login\x12\x16.iris.job.LoginRequest\x1a\x17.iris.job.LoginResponse\x12M\n\x0c\x43reateApiKey\x12\x1d.iris.job.CreateApiKeyRequest\x1a\x1e.iris.job.CreateApiKeyResponse\x12>\n\x0cRevokeApiKey\x12\x1d.iris.job.RevokeApiKeyRequest\x1a\x0f.iris.job.Empty\x12J\n\x0bListApiKeys\x12\x1c.iris.job.ListApiKeysRequest\x1a\x1d.iris.job.ListApiKeysResponse\x12S\n\x0eGetCurrentUser\x12\x1f.iris.job.GetCurrentUserRequest\x1a .iris.job.GetCurrentUserResponse\x12z\n\x11GetProviderStatus\x12\x31.iris.cluster.Controller.GetProviderStatusRequest\x1a\x32.iris.cluster.Controller.GetProviderStatusResponse\x12\x95\x01\n\x1aGetKubernetesClusterStatus\x12:.iris.cluster.Controller.GetKubernetesClusterStatusRequest\x1a;.iris.cluster.Controller.GetKubernetesClusterStatusResponse\x12L\n\x0f\x45xecuteRawQuery\x12\x1b.iris.query.RawQueryRequest\x1a\x1c.iris.query.RawQueryResponse\x12n\n\rRestartWorker\x12-.iris.cluster.Controller.RestartWorkerRequest\x1a..iris.cluster.Controller.RestartWorkerResponse\x12n\n\rSetUserBudget\x12-.iris.cluster.Controller.SetUserBudgetRequest\x1a..iris.cluster.Controller.SetUserBudgetResponse\x12n\n\rGetUserBudget\x12-.iris.cluster.Controller.GetUserBudgetRequest\x1a..iris.cluster.Controller.GetUserBudgetResponse\x12t\n\x0fListUserBudgets\x12/.iris.cluster.Controller.ListUserBudgetsRequest\x1a\x30.iris.cluster.Controller.ListUserBudgetsResponse\x12z\n\x11GetSchedulerState\x12\x31.iris.cluster.Controller.GetSchedulerStateRequest\x1a\x32.iris.cluster.Controller.GetSchedulerStateResponseBt\n\x10\x63om.iris.clusterB\x0f\x43ontrollerProtoP\x01\xa2\x02\x03ICX\xaa\x02\x0cIris.Cluster\xca\x02\x0cIris\\Cluster\xe2\x02\x18Iris\\Cluster\\GPBMetadata\xea\x02\rIris::Clusterb\x08\x65\x64itionsp\xe8\x07') +DESCRIPTOR = 
_descriptor_pool.Default().AddSerializedFile(b'\n\x10\x63ontroller.proto\x12\x0ciris.cluster\x1a\tjob.proto\x1a\rlogging.proto\x1a\x0bquery.proto\x1a\ntime.proto\x1a\x08vm.proto\"\xb8Z\n\nController\x1a\xc7\x08\n\x10LaunchJobRequest\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12;\n\nentrypoint\x18\x02 \x01(\x0b\x32\x1b.iris.job.RuntimeEntrypointR\nentrypoint\x12\x39\n\tresources\x18\x03 \x01(\x0b\x32\x1b.iris.job.ResourceSpecProtoR\tresources\x12=\n\x0b\x65nvironment\x18\x04 \x01(\x0b\x32\x1b.iris.job.EnvironmentConfigR\x0b\x65nvironment\x12\x1b\n\tbundle_id\x18\x05 \x01(\tR\x08\x62undleId\x12\x1f\n\x0b\x62undle_blob\x18\x06 \x01(\x0cR\nbundleBlob\x12\x42\n\x12scheduling_timeout\x18\x08 \x01(\x0b\x32\x13.iris.time.DurationR\x11schedulingTimeout\x12\x14\n\x05ports\x18\t \x03(\tR\x05ports\x12*\n\x11max_task_failures\x18\x0b \x01(\x05R\x0fmaxTaskFailures\x12.\n\x13max_retries_failure\x18\x0c \x01(\x05R\x11maxRetriesFailure\x12\x34\n\x16max_retries_preemption\x18\r \x01(\x05R\x14maxRetriesPreemption\x12\x36\n\x0b\x63onstraints\x18\x0e \x03(\x0b\x32\x14.iris.job.ConstraintR\x0b\x63onstraints\x12@\n\x0c\x63oscheduling\x18\x0f \x01(\x0b\x32\x1c.iris.job.CoschedulingConfigR\x0c\x63oscheduling\x12\x1a\n\x08replicas\x18\x14 \x01(\x05R\x08replicas\x12-\n\x07timeout\x18\x15 \x01(\x0b\x32\x13.iris.time.DurationR\x07timeout\x12$\n\x0e\x66\x61il_if_exists\x18\x16 \x01(\x08R\x0c\x66\x61ilIfExists\x12=\n\x0breservation\x18\x1e \x01(\x0b\x32\x1b.iris.job.ReservationConfigR\x0breservation\x12J\n\x11preemption_policy\x18\x1f \x01(\x0e\x32\x1d.iris.job.JobPreemptionPolicyR\x10preemptionPolicy\x12K\n\x13\x65xisting_job_policy\x18 \x01(\x0e\x32\x1b.iris.job.ExistingJobPolicyR\x11\x65xistingJobPolicy\x12;\n\rpriority_band\x18! 
\x01(\x0e\x32\x16.iris.job.PriorityBandR\x0cpriorityBand\x12\x1d\n\ntask_image\x18\" \x01(\tR\ttaskImage\x12\x1f\n\x0bsubmit_argv\x18# \x03(\tR\nsubmitArgv\x1a*\n\x11LaunchJobResponse\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a\x38\n\x13GetJobStatusRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobIdJ\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04\x1a\xfa\x01\n\x14GetJobStatusResponse\x12%\n\x03job\x18\x01 \x01(\x0b\x32\x13.iris.job.JobStatusR\x03job\x12\x43\n\x07request\x18\x02 \x01(\x0b\x32).iris.cluster.Controller.LaunchJobRequestR\x07request\x12:\n\x0cresource_min\x18\x03 \x01(\x0b\x32\x17.iris.job.ResourceUsageR\x0bresourceMin\x12:\n\x0cresource_max\x18\x04 \x01(\x0b\x32\x17.iris.job.ResourceUsageR\x0bresourceMax\x1a-\n\x12GetJobStateRequest\x12\x17\n\x07job_ids\x18\x01 \x03(\tR\x06jobIds\x1a\xb6\x01\n\x13GetJobStateResponse\x12P\n\x06states\x18\x01 \x03(\x0b\x32\x38.iris.cluster.Controller.GetJobStateResponse.StatesEntryR\x06states\x1aM\n\x0bStatesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12(\n\x05value\x18\x02 \x01(\x0e\x32\x12.iris.job.JobStateR\x05value:\x02\x38\x01\x1a,\n\x13TerminateJobRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a\xf3\x02\n\x08JobQuery\x12<\n\x05scope\x18\x01 \x01(\x0e\x32&.iris.cluster.Controller.JobQueryScopeR\x05scope\x12\"\n\rparent_job_id\x18\x02 \x01(\tR\x0bparentJobId\x12\x1f\n\x0bname_filter\x18\x03 \x01(\tR\nnameFilter\x12!\n\x0cstate_filter\x18\x04 \x01(\tR\x0bstateFilter\x12\x44\n\nsort_field\x18\x05 \x01(\x0e\x32%.iris.cluster.Controller.JobSortFieldR\tsortField\x12M\n\x0esort_direction\x18\x06 \x01(\x0e\x32&.iris.cluster.Controller.SortDirectionR\rsortDirection\x12\x16\n\x06offset\x18\x07 \x01(\x05R\x06offset\x12\x14\n\x05limit\x18\x08 \x01(\x05R\x05limit\x1a\xc9\x01\n\x0fListJobsRequest\x12\x37\n\x05query\x18\t 
\x01(\x0b\x32!.iris.cluster.Controller.JobQueryR\x05queryJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x06\x10\x07J\x04\x08\x07\x10\x08R\x06offsetR\x05limitR\nsort_fieldR\x0esort_directionR\x0bname_filterR\x0cstate_filterR\rparent_job_id\x1aw\n\x10ListJobsResponse\x12\'\n\x04jobs\x18\x01 \x03(\x0b\x32\x13.iris.job.JobStatusR\x04jobs\x12\x1f\n\x0btotal_count\x18\x02 \x01(\x05R\ntotalCount\x12\x19\n\x08has_more\x18\x03 \x01(\x08R\x07hasMore\x1a/\n\x14GetTaskStatusRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x1a\x83\x01\n\x15GetTaskStatusResponse\x12(\n\x04task\x18\x01 \x01(\x0b\x32\x14.iris.job.TaskStatusR\x04task\x12@\n\rjob_resources\x18\x02 \x01(\x0b\x32\x1b.iris.job.ResourceSpecProtoR\x0cjobResources\x1a)\n\x10ListTasksRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a?\n\x11ListTasksResponse\x12*\n\x05tasks\x18\x01 \x03(\x0b\x32\x14.iris.job.TaskStatusR\x05tasks\x1at\n\x16\x45xecInContainerRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x18\n\x07\x63ommand\x18\x02 \x03(\tR\x07\x63ommand\x12\'\n\x0ftimeout_seconds\x18\x03 \x01(\x05R\x0etimeoutSeconds\x1a|\n\x17\x45xecInContainerResponse\x12\x1b\n\texit_code\x18\x01 \x01(\x05R\x08\x65xitCode\x12\x16\n\x06stdout\x18\x02 \x01(\tR\x06stdout\x12\x16\n\x06stderr\x18\x03 \x01(\tR\x06stderr\x12\x14\n\x05\x65rror\x18\x04 \x01(\tR\x05\x65rror\x1a\xb4\x01\n\nWorkerInfo\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x18\n\x07\x61\x64\x64ress\x18\x02 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x03 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12\x39\n\rregistered_at\x18\x04 \x01(\x0b\x32\x14.iris.time.TimestampR\x0cregisteredAt\x1a\xda\x02\n\x12WorkerHealthStatus\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x18\n\x07healthy\x18\x02 \x01(\x08R\x07healthy\x12\x31\n\x14\x63onsecutive_failures\x18\x03 \x01(\x05R\x13\x63onsecutiveFailures\x12;\n\x0elast_heartbeat\x18\x04 
\x01(\x0b\x32\x14.iris.time.TimestampR\rlastHeartbeat\x12&\n\x0frunning_job_ids\x18\x05 \x03(\tR\rrunningJobIds\x12\x18\n\x07\x61\x64\x64ress\x18\x06 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x07 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12%\n\x0estatus_message\x18\x08 \x01(\tR\rstatusMessage\x1a\x14\n\x12ListWorkersRequest\x1a\\\n\x13ListWorkersResponse\x12\x45\n\x07workers\x18\x01 \x03(\x0b\x32+.iris.cluster.Controller.WorkerHealthStatusR\x07workers\x1a\xba\x01\n\x0fRegisterRequest\x12\x18\n\x07\x61\x64\x64ress\x18\x01 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x02 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12\x1b\n\tworker_id\x18\x03 \x01(\tR\x08workerId\x12\x19\n\x08slice_id\x18\x04 \x01(\tR\x07sliceId\x12\x1f\n\x0bscale_group\x18\x05 \x01(\tR\nscaleGroup\x1aK\n\x10RegisterResponse\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x1a\n\x08\x61\x63\x63\x65pted\x18\x02 \x01(\x08R\x08\x61\x63\x63\x65pted\x1a\xfc\x01\n\x08\x45ndpoint\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x18\n\x07\x61\x64\x64ress\x18\x03 \x01(\tR\x07\x61\x64\x64ress\x12\x17\n\x07task_id\x18\x04 \x01(\tR\x06taskId\x12K\n\x08metadata\x18\x05 \x03(\x0b\x32/.iris.cluster.Controller.Endpoint.MetadataEntryR\x08metadata\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\xb9\x02\n\x17RegisterEndpointRequest\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x18\n\x07\x61\x64\x64ress\x18\x02 \x01(\tR\x07\x61\x64\x64ress\x12\x17\n\x07task_id\x18\x03 \x01(\tR\x06taskId\x12Z\n\x08metadata\x18\x04 \x03(\x0b\x32>.iris.cluster.Controller.RegisterEndpointRequest.MetadataEntryR\x08metadata\x12\x1d\n\nattempt_id\x18\x05 \x01(\x05R\tattemptId\x12\x1f\n\x0b\x65ndpoint_id\x18\x06 \x01(\tR\nendpointId\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 
\x01(\tR\x05value:\x02\x38\x01\x1a;\n\x18RegisterEndpointResponse\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x1a<\n\x19UnregisterEndpointRequest\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x1a\x44\n\x14ListEndpointsRequest\x12\x16\n\x06prefix\x18\x01 \x01(\tR\x06prefix\x12\x14\n\x05\x65xact\x18\x02 \x01(\x08R\x05\x65xact\x1aX\n\x15ListEndpointsResponse\x12?\n\tendpoints\x18\x01 \x03(\x0b\x32!.iris.cluster.Controller.EndpointR\tendpoints\x1a\x1c\n\x1aGetAutoscalerStatusRequest\x1aP\n\x1bGetAutoscalerStatusResponse\x12\x31\n\x06status\x18\x01 \x01(\x0b\x32\x19.iris.vm.AutoscalerStatusR\x06status\x1a\x96\x01\n\x11TransactionAction\x12\x32\n\ttimestamp\x18\x01 \x01(\x0b\x32\x14.iris.time.TimestampR\ttimestamp\x12\x16\n\x06\x61\x63tion\x18\x02 \x01(\tR\x06\x61\x63tion\x12\x1b\n\tentity_id\x18\x03 \x01(\tR\x08\x65ntityId\x12\x18\n\x07\x64\x65tails\x18\x04 \x01(\tR\x07\x64\x65tails\x1a.\n\x16GetTransactionsRequest\x12\x14\n\x05limit\x18\x01 \x01(\x05R\x05limit\x1a_\n\x17GetTransactionsResponse\x12\x44\n\x07\x61\x63tions\x18\x01 \x03(\x0b\x32*.iris.cluster.Controller.TransactionActionR\x07\x61\x63tions\x1a\x18\n\x16\x42\x65ginCheckpointRequest\x1a\xd6\x01\n\x17\x42\x65ginCheckpointResponse\x12\'\n\x0f\x63heckpoint_path\x18\x01 \x01(\tR\x0e\x63heckpointPath\x12\x33\n\ncreated_at\x18\x02 \x01(\x0b\x32\x14.iris.time.TimestampR\tcreatedAt\x12\x1b\n\tjob_count\x18\x03 \x01(\x05R\x08jobCount\x12\x1d\n\ntask_count\x18\x04 \x01(\x05R\ttaskCount\x12!\n\x0cworker_count\x18\x05 \x01(\x05R\x0bworkerCount\x1a\xf3\x02\n\x0bUserSummary\x12\x12\n\x04user\x18\x01 \x01(\tR\x04user\x12\x65\n\x11task_state_counts\x18\x02 \x03(\x0b\x32\x39.iris.cluster.Controller.UserSummary.TaskStateCountsEntryR\x0ftaskStateCounts\x12\x62\n\x10job_state_counts\x18\x03 \x03(\x0b\x32\x38.iris.cluster.Controller.UserSummary.JobStateCountsEntryR\x0ejobStateCounts\x1a\x42\n\x14TaskStateCountsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 
\x01(\x05R\x05value:\x02\x38\x01\x1a\x41\n\x13JobStateCountsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\x05R\x05value:\x02\x38\x01\x1a\x12\n\x10ListUsersRequest\x1aO\n\x11ListUsersResponse\x12:\n\x05users\x18\x01 \x03(\x0b\x32$.iris.cluster.Controller.UserSummaryR\x05users\x1a\x98\x02\n\x12GetTaskLogsRequest\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12)\n\x10include_children\x18\x02 \x01(\x08R\x0fincludeChildren\x12\x19\n\x08since_ms\x18\x03 \x01(\x03R\x07sinceMs\x12&\n\x0fmax_total_lines\x18\x04 \x01(\x03R\rmaxTotalLines\x12\x1c\n\tsubstring\x18\x05 \x01(\tR\tsubstring\x12\x1d\n\nattempt_id\x18\x06 \x01(\x05R\tattemptId\x12\x1b\n\tmin_level\x18\x07 \x01(\tR\x08minLevel\x12\x16\n\x06\x63ursor\x18\x08 \x01(\x03R\x06\x63ursor\x12\x12\n\x04tail\x18\t \x01(\x08R\x04tail\x1a\x86\x01\n\x0cTaskLogBatch\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12*\n\x04logs\x18\x02 \x03(\x0b\x32\x16.iris.logging.LogEntryR\x04logs\x12\x14\n\x05\x65rror\x18\x03 \x01(\tR\x05\x65rror\x12\x1b\n\tworker_id\x18\x04 \x01(\tR\x08workerId\x1a\xd2\x01\n\x13GetTaskLogsResponse\x12\x42\n\ttask_logs\x18\x01 \x03(\x0b\x32%.iris.cluster.Controller.TaskLogBatchR\x08taskLogs\x12\x1c\n\ttruncated\x18\x02 \x01(\x08R\ttruncated\x12\x41\n\x12\x63hild_job_statuses\x18\x03 \x03(\x0b\x32\x13.iris.job.JobStatusR\x10\x63hildJobStatuses\x12\x16\n\x06\x63ursor\x18\x04 \x01(\x03R\x06\x63ursor\x1a(\n\x16GetWorkerStatusRequest\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x1a\xe8\x03\n\x17GetWorkerStatusResponse\x12\x1f\n\x02vm\x18\x01 \x01(\x0b\x32\x0f.iris.vm.VmInfoR\x02vm\x12\x1f\n\x0bscale_group\x18\x02 \x01(\tR\nscaleGroup\x12\x43\n\x06worker\x18\x03 \x01(\x0b\x32+.iris.cluster.Controller.WorkerHealthStatusR\x06worker\x12%\n\x0e\x62ootstrap_logs\x18\x04 \x01(\tR\rbootstrapLogs\x12\x44\n\x12worker_log_entries\x18\t \x03(\x0b\x32\x16.iris.logging.LogEntryR\x10workerLogEntries\x12\x37\n\x0crecent_tasks\x18\x06 
\x03(\x0b\x32\x14.iris.job.TaskStatusR\x0brecentTasks\x12M\n\x11\x63urrent_resources\x18\x07 \x01(\x0b\x32 .iris.job.WorkerResourceSnapshotR\x10\x63urrentResources\x12K\n\x10resource_history\x18\x08 \x03(\x0b\x32 .iris.job.WorkerResourceSnapshotR\x0fresourceHistoryJ\x04\x08\x05\x10\x06\x1a\xce\x01\n\x0fSchedulingEvent\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x1d\n\nattempt_id\x18\x02 \x01(\x05R\tattemptId\x12\x1d\n\nevent_type\x18\x03 \x01(\tR\teventType\x12\x16\n\x06reason\x18\x04 \x01(\tR\x06reason\x12\x18\n\x07message\x18\x05 \x01(\tR\x07message\x12\x32\n\ttimestamp\x18\x06 \x01(\x0b\x32\x14.iris.time.TimestampR\ttimestamp\x1a\x8e\x02\n\x0f\x43lusterCapacity\x12+\n\x11schedulable_nodes\x18\x01 \x01(\x05R\x10schedulableNodes\x12\x30\n\x14total_cpu_millicores\x18\x02 \x01(\x03R\x12totalCpuMillicores\x12\x38\n\x18\x61vailable_cpu_millicores\x18\x03 \x01(\x03R\x16\x61vailableCpuMillicores\x12,\n\x12total_memory_bytes\x18\x04 \x01(\x03R\x10totalMemoryBytes\x12\x34\n\x16\x61vailable_memory_bytes\x18\x05 \x01(\x03R\x14\x61vailableMemoryBytes\x1a\x1a\n\x18GetProviderStatusRequest\x1a\xe8\x01\n\x19GetProviderStatusResponse\x12.\n\x13has_direct_provider\x18\x01 \x01(\x08R\x11hasDirectProvider\x12U\n\x11scheduling_events\x18\x02 \x03(\x0b\x32(.iris.cluster.Controller.SchedulingEventR\x10schedulingEvents\x12\x44\n\x08\x63\x61pacity\x18\x03 \x01(\x0b\x32(.iris.cluster.Controller.ClusterCapacityR\x08\x63\x61pacity\x1a#\n!GetKubernetesClusterStatusRequest\x1a\xed\x01\n\x13KubernetesPodStatus\x12\x19\n\x08pod_name\x18\x01 \x01(\tR\x07podName\x12\x17\n\x07task_id\x18\x02 \x01(\tR\x06taskId\x12\x14\n\x05phase\x18\x03 \x01(\tR\x05phase\x12\x16\n\x06reason\x18\x04 \x01(\tR\x06reason\x12\x18\n\x07message\x18\x05 \x01(\tR\x07message\x12=\n\x0flast_transition\x18\x06 \x01(\x0b\x32\x14.iris.time.TimestampR\x0elastTransition\x12\x1b\n\tnode_name\x18\x07 \x01(\tR\x08nodeName\x1a\x8f\x03\n\x0eNodePoolStatus\x12\x12\n\x04name\x18\x01 
\x01(\tR\x04name\x12#\n\rinstance_type\x18\x02 \x01(\tR\x0cinstanceType\x12\x1f\n\x0bscale_group\x18\x03 \x01(\tR\nscaleGroup\x12!\n\x0ctarget_nodes\x18\x04 \x01(\x05R\x0btargetNodes\x12#\n\rcurrent_nodes\x18\x05 \x01(\x05R\x0c\x63urrentNodes\x12!\n\x0cqueued_nodes\x18\x06 \x01(\x05R\x0bqueuedNodes\x12*\n\x11in_progress_nodes\x18\x07 \x01(\x05R\x0finProgressNodes\x12 \n\x0b\x61utoscaling\x18\x08 \x01(\x08R\x0b\x61utoscaling\x12\x1b\n\tmin_nodes\x18\t \x01(\x05R\x08minNodes\x12\x1b\n\tmax_nodes\x18\n \x01(\x05R\x08maxNodes\x12\x1a\n\x08\x63\x61pacity\x18\x0b \x01(\tR\x08\x63\x61pacity\x12\x14\n\x05quota\x18\x0c \x01(\tR\x05quota\x1a\xac\x03\n\"GetKubernetesClusterStatusResponse\x12\x1c\n\tnamespace\x18\x01 \x01(\tR\tnamespace\x12\x1f\n\x0btotal_nodes\x18\x02 \x01(\x05R\ntotalNodes\x12+\n\x11schedulable_nodes\x18\x03 \x01(\x05R\x10schedulableNodes\x12\'\n\x0f\x61llocatable_cpu\x18\x04 \x01(\tR\x0e\x61llocatableCpu\x12-\n\x12\x61llocatable_memory\x18\x05 \x01(\tR\x11\x61llocatableMemory\x12O\n\x0cpod_statuses\x18\x06 \x03(\x0b\x32,.iris.cluster.Controller.KubernetesPodStatusR\x0bpodStatuses\x12)\n\x10provider_version\x18\x07 \x01(\tR\x0fproviderVersion\x12\x46\n\nnode_pools\x18\x08 \x03(\x0b\x32\'.iris.cluster.Controller.NodePoolStatusR\tnodePools\x1a\x33\n\x14RestartWorkerRequest\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x1aI\n\x15RestartWorkerResponse\x12\x1a\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08R\x08\x61\x63\x63\x65pted\x12\x14\n\x05\x65rror\x18\x02 \x01(\tR\x05\x65rror\x1a\x85\x01\n\x14SetUserBudgetRequest\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 \x01(\x03R\x0b\x62udgetLimit\x12\x31\n\x08max_band\x18\x03 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x1a\x17\n\x15SetUserBudgetResponse\x1a/\n\x14GetUserBudgetRequest\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x1a\xa9\x01\n\x15GetUserBudgetResponse\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 
\x01(\x03R\x0b\x62udgetLimit\x12!\n\x0c\x62udget_spent\x18\x03 \x01(\x03R\x0b\x62udgetSpent\x12\x31\n\x08max_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x1a\x18\n\x16ListUserBudgetsRequest\x1a_\n\x17ListUserBudgetsResponse\x12\x44\n\x05users\x18\x01 \x03(\x0b\x32..iris.cluster.Controller.GetUserBudgetResponseR\x05users\x1al\n\x17UpdateTaskStatusRequest\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x34\n\x07updates\x18\x02 \x03(\x0b\x32\x1a.iris.job.WorkerTaskStatusR\x07updates\x1a\x1a\n\x18UpdateTaskStatusResponse\x1a\x1a\n\x18GetSchedulerStateRequest\x1a\xa7\x02\n\x12SchedulerTaskEntry\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x15\n\x06job_id\x18\x02 \x01(\tR\x05jobId\x12\x17\n\x07user_id\x18\x03 \x01(\tR\x06userId\x12;\n\roriginal_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x0coriginalBand\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12%\n\x0equeue_position\x18\x06 \x01(\x05R\rqueuePosition\x12%\n\x0eresource_value\x18\x07 \x01(\x05R\rresourceValue\x1a\xa7\x01\n\x12SchedulerBandGroup\x12*\n\x04\x62\x61nd\x18\x01 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x04\x62\x61nd\x12\x41\n\x05tasks\x18\x02 \x03(\x0b\x32+.iris.cluster.Controller.SchedulerTaskEntryR\x05tasks\x12\"\n\rtotal_in_band\x18\x03 \x01(\x05R\x0btotalInBand\x1a\x97\x02\n\x13SchedulerUserBudget\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 \x01(\x03R\x0b\x62udgetLimit\x12!\n\x0c\x62udget_spent\x18\x03 \x01(\x03R\x0b\x62udgetSpent\x12\x31\n\x08max_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12/\n\x13utilization_percent\x18\x06 \x01(\x02R\x12utilizationPercent\x1a\xea\x02\n\x14SchedulerRunningTask\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x15\n\x06job_id\x18\x02 \x01(\tR\x05jobId\x12\x17\n\x07user_id\x18\x03 
\x01(\tR\x06userId\x12\x1b\n\tworker_id\x18\x04 \x01(\tR\x08workerId\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12%\n\x0eresource_value\x18\x06 \x01(\x05R\rresourceValue\x12 \n\x0bpreemptible\x18\x07 \x01(\x08R\x0bpreemptible\x12=\n\x0epreemptible_by\x18\x08 \x03(\x0e\x32\x16.iris.job.PriorityBandR\rpreemptibleBy\x12%\n\x0eis_coscheduled\x18\t \x01(\x08R\risCoscheduled\x1a\xdc\x02\n\x19GetSchedulerStateResponse\x12P\n\rpending_queue\x18\x01 \x03(\x0b\x32+.iris.cluster.Controller.SchedulerBandGroupR\x0cpendingQueue\x12O\n\x0cuser_budgets\x18\x02 \x03(\x0b\x32,.iris.cluster.Controller.SchedulerUserBudgetR\x0buserBudgets\x12R\n\rrunning_tasks\x18\x03 \x03(\x0b\x32-.iris.cluster.Controller.SchedulerRunningTaskR\x0crunningTasks\x12#\n\rtotal_pending\x18\x04 \x01(\x05R\x0ctotalPending\x12#\n\rtotal_running\x18\x05 \x01(\x05R\x0ctotalRunning\"\xb7\x01\n\x0cJobSortField\x12\x1e\n\x1aJOB_SORT_FIELD_UNSPECIFIED\x10\x00\x12\x17\n\x13JOB_SORT_FIELD_DATE\x10\x01\x12\x17\n\x13JOB_SORT_FIELD_NAME\x10\x02\x12\x18\n\x14JOB_SORT_FIELD_STATE\x10\x03\x12\x1b\n\x17JOB_SORT_FIELD_FAILURES\x10\x04\x12\x1e\n\x1aJOB_SORT_FIELD_PREEMPTIONS\x10\x05\"`\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x16\n\x12SORT_DIRECTION_ASC\x10\x01\x12\x17\n\x13SORT_DIRECTION_DESC\x10\x02\"\x82\x01\n\rJobQueryScope\x12\x1f\n\x1bJOB_QUERY_SCOPE_UNSPECIFIED\x10\x00\x12\x17\n\x13JOB_QUERY_SCOPE_ALL\x10\x01\x12\x19\n\x15JOB_QUERY_SCOPE_ROOTS\x10\x02\x12\x1c\n\x18JOB_QUERY_SCOPE_CHILDREN\x10\x03\x32\x84\x1d\n\x11\x43ontrollerService\x12\x62\n\tLaunchJob\x12).iris.cluster.Controller.LaunchJobRequest\x1a*.iris.cluster.Controller.LaunchJobResponse\x12k\n\x0cGetJobStatus\x12,.iris.cluster.Controller.GetJobStatusRequest\x1a-.iris.cluster.Controller.GetJobStatusResponse\x12h\n\x0bGetJobState\x12+.iris.cluster.Controller.GetJobStateRequest\x1a,.iris.cluster.Controller.GetJobStateResponse\x12M\n\x0cTerminateJob\x12,.iris.cluster.Controlle
r.TerminateJobRequest\x1a\x0f.iris.job.Empty\x12_\n\x08ListJobs\x12(.iris.cluster.Controller.ListJobsRequest\x1a).iris.cluster.Controller.ListJobsResponse\x12n\n\rGetTaskStatus\x12-.iris.cluster.Controller.GetTaskStatusRequest\x1a..iris.cluster.Controller.GetTaskStatusResponse\x12\x62\n\tListTasks\x12).iris.cluster.Controller.ListTasksRequest\x1a*.iris.cluster.Controller.ListTasksResponse\x12_\n\x08Register\x12(.iris.cluster.Controller.RegisterRequest\x1a).iris.cluster.Controller.RegisterResponse\x12h\n\x0bListWorkers\x12+.iris.cluster.Controller.ListWorkersRequest\x1a,.iris.cluster.Controller.ListWorkersResponse\x12w\n\x10RegisterEndpoint\x12\x30.iris.cluster.Controller.RegisterEndpointRequest\x1a\x31.iris.cluster.Controller.RegisterEndpointResponse\x12Y\n\x12UnregisterEndpoint\x12\x32.iris.cluster.Controller.UnregisterEndpointRequest\x1a\x0f.iris.job.Empty\x12n\n\rListEndpoints\x12-.iris.cluster.Controller.ListEndpointsRequest\x1a..iris.cluster.Controller.ListEndpointsResponse\x12\x80\x01\n\x13GetAutoscalerStatus\x12\x33.iris.cluster.Controller.GetAutoscalerStatusRequest\x1a\x34.iris.cluster.Controller.GetAutoscalerStatusResponse\x12t\n\x0fGetTransactions\x12/.iris.cluster.Controller.GetTransactionsRequest\x1a\x30.iris.cluster.Controller.GetTransactionsResponse\x12\x62\n\tListUsers\x12).iris.cluster.Controller.ListUsersRequest\x1a*.iris.cluster.Controller.ListUsersResponse\x12h\n\x0bGetTaskLogs\x12+.iris.cluster.Controller.GetTaskLogsRequest\x1a,.iris.cluster.Controller.GetTaskLogsResponse\x12J\n\x0bProfileTask\x12\x1c.iris.job.ProfileTaskRequest\x1a\x1d.iris.job.ProfileTaskResponse\x12t\n\x0f\x45xecInContainer\x12/.iris.cluster.Controller.ExecInContainerRequest\x1a\x30.iris.cluster.Controller.ExecInContainerResponse\x12t\n\x0fGetWorkerStatus\x12/.iris.cluster.Controller.GetWorkerStatusRequest\x1a\x30.iris.cluster.Controller.GetWorkerStatusResponse\x12t\n\x0f\x42\x65ginCheckpoint\x12/.iris.cluster.Controller.BeginCheckpointRequest\x1a\x30.iris.cluster.Controller.B
eginCheckpointResponse\x12Y\n\x10GetProcessStatus\x12!.iris.job.GetProcessStatusRequest\x1a\".iris.job.GetProcessStatusResponse\x12J\n\x0bGetAuthInfo\x12\x1c.iris.job.GetAuthInfoRequest\x1a\x1d.iris.job.GetAuthInfoResponse\x12\x38\n\x05Login\x12\x16.iris.job.LoginRequest\x1a\x17.iris.job.LoginResponse\x12M\n\x0c\x43reateApiKey\x12\x1d.iris.job.CreateApiKeyRequest\x1a\x1e.iris.job.CreateApiKeyResponse\x12>\n\x0cRevokeApiKey\x12\x1d.iris.job.RevokeApiKeyRequest\x1a\x0f.iris.job.Empty\x12J\n\x0bListApiKeys\x12\x1c.iris.job.ListApiKeysRequest\x1a\x1d.iris.job.ListApiKeysResponse\x12S\n\x0eGetCurrentUser\x12\x1f.iris.job.GetCurrentUserRequest\x1a .iris.job.GetCurrentUserResponse\x12z\n\x11GetProviderStatus\x12\x31.iris.cluster.Controller.GetProviderStatusRequest\x1a\x32.iris.cluster.Controller.GetProviderStatusResponse\x12\x95\x01\n\x1aGetKubernetesClusterStatus\x12:.iris.cluster.Controller.GetKubernetesClusterStatusRequest\x1a;.iris.cluster.Controller.GetKubernetesClusterStatusResponse\x12L\n\x0f\x45xecuteRawQuery\x12\x1b.iris.query.RawQueryRequest\x1a\x1c.iris.query.RawQueryResponse\x12n\n\rRestartWorker\x12-.iris.cluster.Controller.RestartWorkerRequest\x1a..iris.cluster.Controller.RestartWorkerResponse\x12n\n\rSetUserBudget\x12-.iris.cluster.Controller.SetUserBudgetRequest\x1a..iris.cluster.Controller.SetUserBudgetResponse\x12n\n\rGetUserBudget\x12-.iris.cluster.Controller.GetUserBudgetRequest\x1a..iris.cluster.Controller.GetUserBudgetResponse\x12t\n\x0fListUserBudgets\x12/.iris.cluster.Controller.ListUserBudgetsRequest\x1a\x30.iris.cluster.Controller.ListUserBudgetsResponse\x12z\n\x11GetSchedulerState\x12\x31.iris.cluster.Controller.GetSchedulerStateRequest\x1a\x32.iris.cluster.Controller.GetSchedulerStateResponse\x12w\n\x10UpdateTaskStatus\x12\x30.iris.cluster.Controller.UpdateTaskStatusRequest\x1a\x31.iris.cluster.Controller.UpdateTaskStatusResponseBt\n\x10\x63om.iris.clusterB\x0f\x43ontrollerProtoP\x01\xa2\x02\x03ICX\xaa\x02\x0cIris.Cluster\xca\x02\x0cIris\\Cluste
r\xe2\x02\x18Iris\\Cluster\\GPBMetadata\xea\x02\rIris::Clusterb\x08\x65\x64itionsp\xe8\x07') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -48,7 +48,7 @@ _globals['_CONTROLLER_USERSUMMARY_JOBSTATECOUNTSENTRY']._loaded_options = None _globals['_CONTROLLER_USERSUMMARY_JOBSTATECOUNTSENTRY']._serialized_options = b'8\001' _globals['_CONTROLLER']._serialized_start=96 - _globals['_CONTROLLER']._serialized_end=11534 + _globals['_CONTROLLER']._serialized_end=11672 _globals['_CONTROLLER_LAUNCHJOBREQUEST']._serialized_start=111 _globals['_CONTROLLER_LAUNCHJOBREQUEST']._serialized_end=1206 _globals['_CONTROLLER_LAUNCHJOBRESPONSE']._serialized_start=1208 @@ -177,24 +177,28 @@ _globals['_CONTROLLER_LISTUSERBUDGETSREQUEST']._serialized_end=9526 _globals['_CONTROLLER_LISTUSERBUDGETSRESPONSE']._serialized_start=9528 _globals['_CONTROLLER_LISTUSERBUDGETSRESPONSE']._serialized_end=9623 - _globals['_CONTROLLER_GETSCHEDULERSTATEREQUEST']._serialized_start=9625 - _globals['_CONTROLLER_GETSCHEDULERSTATEREQUEST']._serialized_end=9651 - _globals['_CONTROLLER_SCHEDULERTASKENTRY']._serialized_start=9654 - _globals['_CONTROLLER_SCHEDULERTASKENTRY']._serialized_end=9949 - _globals['_CONTROLLER_SCHEDULERBANDGROUP']._serialized_start=9952 - _globals['_CONTROLLER_SCHEDULERBANDGROUP']._serialized_end=10119 - _globals['_CONTROLLER_SCHEDULERUSERBUDGET']._serialized_start=10122 - _globals['_CONTROLLER_SCHEDULERUSERBUDGET']._serialized_end=10401 - _globals['_CONTROLLER_SCHEDULERRUNNINGTASK']._serialized_start=10404 - _globals['_CONTROLLER_SCHEDULERRUNNINGTASK']._serialized_end=10766 - _globals['_CONTROLLER_GETSCHEDULERSTATERESPONSE']._serialized_start=10769 - _globals['_CONTROLLER_GETSCHEDULERSTATERESPONSE']._serialized_end=11117 - _globals['_CONTROLLER_JOBSORTFIELD']._serialized_start=11120 - _globals['_CONTROLLER_JOBSORTFIELD']._serialized_end=11303 - _globals['_CONTROLLER_SORTDIRECTION']._serialized_start=11305 - 
_globals['_CONTROLLER_SORTDIRECTION']._serialized_end=11401 - _globals['_CONTROLLER_JOBQUERYSCOPE']._serialized_start=11404 - _globals['_CONTROLLER_JOBQUERYSCOPE']._serialized_end=11534 - _globals['_CONTROLLERSERVICE']._serialized_start=11537 - _globals['_CONTROLLERSERVICE']._serialized_end=15132 + _globals['_CONTROLLER_UPDATETASKSTATUSREQUEST']._serialized_start=9625 + _globals['_CONTROLLER_UPDATETASKSTATUSREQUEST']._serialized_end=9733 + _globals['_CONTROLLER_UPDATETASKSTATUSRESPONSE']._serialized_start=9735 + _globals['_CONTROLLER_UPDATETASKSTATUSRESPONSE']._serialized_end=9761 + _globals['_CONTROLLER_GETSCHEDULERSTATEREQUEST']._serialized_start=9763 + _globals['_CONTROLLER_GETSCHEDULERSTATEREQUEST']._serialized_end=9789 + _globals['_CONTROLLER_SCHEDULERTASKENTRY']._serialized_start=9792 + _globals['_CONTROLLER_SCHEDULERTASKENTRY']._serialized_end=10087 + _globals['_CONTROLLER_SCHEDULERBANDGROUP']._serialized_start=10090 + _globals['_CONTROLLER_SCHEDULERBANDGROUP']._serialized_end=10257 + _globals['_CONTROLLER_SCHEDULERUSERBUDGET']._serialized_start=10260 + _globals['_CONTROLLER_SCHEDULERUSERBUDGET']._serialized_end=10539 + _globals['_CONTROLLER_SCHEDULERRUNNINGTASK']._serialized_start=10542 + _globals['_CONTROLLER_SCHEDULERRUNNINGTASK']._serialized_end=10904 + _globals['_CONTROLLER_GETSCHEDULERSTATERESPONSE']._serialized_start=10907 + _globals['_CONTROLLER_GETSCHEDULERSTATERESPONSE']._serialized_end=11255 + _globals['_CONTROLLER_JOBSORTFIELD']._serialized_start=11258 + _globals['_CONTROLLER_JOBSORTFIELD']._serialized_end=11441 + _globals['_CONTROLLER_SORTDIRECTION']._serialized_start=11443 + _globals['_CONTROLLER_SORTDIRECTION']._serialized_end=11539 + _globals['_CONTROLLER_JOBQUERYSCOPE']._serialized_start=11542 + _globals['_CONTROLLER_JOBQUERYSCOPE']._serialized_end=11672 + _globals['_CONTROLLERSERVICE']._serialized_start=11675 + _globals['_CONTROLLERSERVICE']._serialized_end=15391 # @@protoc_insertion_point(module_scope) diff --git 
a/lib/iris/src/iris/rpc/controller_pb2.pyi b/lib/iris/src/iris/rpc/controller_pb2.pyi index c9a978da90..044157fea1 100644 --- a/lib/iris/src/iris/rpc/controller_pb2.pyi +++ b/lib/iris/src/iris/rpc/controller_pb2.pyi @@ -630,6 +630,16 @@ class Controller(_message.Message): USERS_FIELD_NUMBER: _ClassVar[int] users: _containers.RepeatedCompositeFieldContainer[Controller.GetUserBudgetResponse] def __init__(self, users: _Optional[_Iterable[_Union[Controller.GetUserBudgetResponse, _Mapping]]] = ...) -> None: ... + class UpdateTaskStatusRequest(_message.Message): + __slots__ = ("worker_id", "updates") + WORKER_ID_FIELD_NUMBER: _ClassVar[int] + UPDATES_FIELD_NUMBER: _ClassVar[int] + worker_id: str + updates: _containers.RepeatedCompositeFieldContainer[_job_pb2.WorkerTaskStatus] + def __init__(self, worker_id: _Optional[str] = ..., updates: _Optional[_Iterable[_Union[_job_pb2.WorkerTaskStatus, _Mapping]]] = ...) -> None: ... + class UpdateTaskStatusResponse(_message.Message): + __slots__ = () + def __init__(self) -> None: ... class GetSchedulerStateRequest(_message.Message): __slots__ = () def __init__(self) -> None: ... 
diff --git a/lib/iris/src/iris/rpc/worker.proto b/lib/iris/src/iris/rpc/worker.proto index e439890d32..7e22a140f9 100644 --- a/lib/iris/src/iris/rpc/worker.proto +++ b/lib/iris/src/iris/rpc/worker.proto @@ -65,6 +65,39 @@ message Worker { string stderr = 3; string error = 4; // Non-empty if exec setup failed } + + // --- Heartbeat Refactor: New focused RPCs --- + + message PingRequest {} + message PingResponse { + iris.job.WorkerResourceSnapshot resource_snapshot = 1; + bool healthy = 2; + string health_error = 3; + } + + message StartTasksRequest { + repeated iris.job.RunTaskRequest tasks = 1; + } + message StartTasksResponse { + repeated TaskAck acks = 1; + } + message TaskAck { + string task_id = 1; + bool accepted = 2; + string error = 3; + } + + message StopTasksRequest { + repeated string task_ids = 1; + } + message StopTasksResponse {} + + message PollTasksRequest { + repeated iris.job.WorkerTaskStatus expected_tasks = 1; + } + message PollTasksResponse { + repeated iris.job.WorkerTaskStatus tasks = 1; + } } // ============================================================================ @@ -83,4 +116,10 @@ service WorkerService { // Exec in container rpc ExecInContainer(Worker.ExecInContainerRequest) returns (Worker.ExecInContainerResponse); + + // --- Heartbeat Refactor: New focused RPCs --- + rpc Ping(Worker.PingRequest) returns (Worker.PingResponse); + rpc StartTasks(Worker.StartTasksRequest) returns (Worker.StartTasksResponse); + rpc StopTasks(Worker.StopTasksRequest) returns (Worker.StopTasksResponse); + rpc PollTasks(Worker.PollTasksRequest) returns (Worker.PollTasksResponse); } diff --git a/lib/iris/src/iris/rpc/worker_connect.py b/lib/iris/src/iris/rpc/worker_connect.py index a32d0c30ab..390d4dfaf1 100644 --- a/lib/iris/src/iris/rpc/worker_connect.py +++ b/lib/iris/src/iris/rpc/worker_connect.py @@ -39,6 +39,18 @@ async def get_process_status(self, request: job__pb2.GetProcessStatusRequest, ct async def exec_in_container(self, request: 
worker__pb2.Worker.ExecInContainerRequest, ctx: RequestContext) -> worker__pb2.Worker.ExecInContainerResponse: raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + async def ping(self, request: worker__pb2.Worker.PingRequest, ctx: RequestContext) -> worker__pb2.Worker.PingResponse: + raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + + async def start_tasks(self, request: worker__pb2.Worker.StartTasksRequest, ctx: RequestContext) -> worker__pb2.Worker.StartTasksResponse: + raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + + async def stop_tasks(self, request: worker__pb2.Worker.StopTasksRequest, ctx: RequestContext) -> worker__pb2.Worker.StopTasksResponse: + raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + + async def poll_tasks(self, request: worker__pb2.Worker.PollTasksRequest, ctx: RequestContext) -> worker__pb2.Worker.PollTasksResponse: + raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + class WorkerServiceASGIApplication(ConnectASGIApplication[WorkerService]): def __init__(self, service: WorkerService | AsyncGenerator[WorkerService], *, interceptors: Iterable[Interceptor]=(), read_max_bytes: int | None = None, compressions: Iterable[Compression] | None = None) -> None: @@ -115,6 +127,46 @@ def __init__(self, service: WorkerService | AsyncGenerator[WorkerService], *, in ), function=svc.exec_in_container, ), + "/iris.cluster.WorkerService/Ping": Endpoint.unary( + method=MethodInfo( + name="Ping", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.PingRequest, + output=worker__pb2.Worker.PingResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + function=svc.ping, + ), + "/iris.cluster.WorkerService/StartTasks": Endpoint.unary( + method=MethodInfo( + name="StartTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.StartTasksRequest, + output=worker__pb2.Worker.StartTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + function=svc.start_tasks, + ), 
+ "/iris.cluster.WorkerService/StopTasks": Endpoint.unary( + method=MethodInfo( + name="StopTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.StopTasksRequest, + output=worker__pb2.Worker.StopTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + function=svc.stop_tasks, + ), + "/iris.cluster.WorkerService/PollTasks": Endpoint.unary( + method=MethodInfo( + name="PollTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.PollTasksRequest, + output=worker__pb2.Worker.PollTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + function=svc.poll_tasks, + ), }, interceptors=interceptors, read_max_bytes=read_max_bytes, @@ -268,6 +320,86 @@ async def exec_in_container( timeout_ms=timeout_ms, ) + async def ping( + self, + request: worker__pb2.Worker.PingRequest, + *, + headers: Headers | Mapping[str, str] | None = None, + timeout_ms: int | None = None, + ) -> worker__pb2.Worker.PingResponse: + return await self.execute_unary( + request=request, + method=MethodInfo( + name="Ping", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.PingRequest, + output=worker__pb2.Worker.PingResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + headers=headers, + timeout_ms=timeout_ms, + ) + + async def start_tasks( + self, + request: worker__pb2.Worker.StartTasksRequest, + *, + headers: Headers | Mapping[str, str] | None = None, + timeout_ms: int | None = None, + ) -> worker__pb2.Worker.StartTasksResponse: + return await self.execute_unary( + request=request, + method=MethodInfo( + name="StartTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.StartTasksRequest, + output=worker__pb2.Worker.StartTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + headers=headers, + timeout_ms=timeout_ms, + ) + + async def stop_tasks( + self, + request: worker__pb2.Worker.StopTasksRequest, + *, + headers: Headers | Mapping[str, str] | None = None, + 
timeout_ms: int | None = None, + ) -> worker__pb2.Worker.StopTasksResponse: + return await self.execute_unary( + request=request, + method=MethodInfo( + name="StopTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.StopTasksRequest, + output=worker__pb2.Worker.StopTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + headers=headers, + timeout_ms=timeout_ms, + ) + + async def poll_tasks( + self, + request: worker__pb2.Worker.PollTasksRequest, + *, + headers: Headers | Mapping[str, str] | None = None, + timeout_ms: int | None = None, + ) -> worker__pb2.Worker.PollTasksResponse: + return await self.execute_unary( + request=request, + method=MethodInfo( + name="PollTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.PollTasksRequest, + output=worker__pb2.Worker.PollTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + headers=headers, + timeout_ms=timeout_ms, + ) + class WorkerServiceSync(Protocol): def get_task_status(self, request: worker__pb2.Worker.GetTaskStatusRequest, ctx: RequestContext) -> job__pb2.TaskStatus: @@ -284,6 +416,14 @@ def get_process_status(self, request: job__pb2.GetProcessStatusRequest, ctx: Req raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") def exec_in_container(self, request: worker__pb2.Worker.ExecInContainerRequest, ctx: RequestContext) -> worker__pb2.Worker.ExecInContainerResponse: raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + def ping(self, request: worker__pb2.Worker.PingRequest, ctx: RequestContext) -> worker__pb2.Worker.PingResponse: + raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + def start_tasks(self, request: worker__pb2.Worker.StartTasksRequest, ctx: RequestContext) -> worker__pb2.Worker.StartTasksResponse: + raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") + def stop_tasks(self, request: worker__pb2.Worker.StopTasksRequest, ctx: RequestContext) -> worker__pb2.Worker.StopTasksResponse: + raise 
ConnectError(Code.UNIMPLEMENTED, "Not implemented") + def poll_tasks(self, request: worker__pb2.Worker.PollTasksRequest, ctx: RequestContext) -> worker__pb2.Worker.PollTasksResponse: + raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") class WorkerServiceWSGIApplication(ConnectWSGIApplication): @@ -360,6 +500,46 @@ def __init__(self, service: WorkerServiceSync, interceptors: Iterable[Intercepto ), function=service.exec_in_container, ), + "/iris.cluster.WorkerService/Ping": EndpointSync.unary( + method=MethodInfo( + name="Ping", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.PingRequest, + output=worker__pb2.Worker.PingResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + function=service.ping, + ), + "/iris.cluster.WorkerService/StartTasks": EndpointSync.unary( + method=MethodInfo( + name="StartTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.StartTasksRequest, + output=worker__pb2.Worker.StartTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + function=service.start_tasks, + ), + "/iris.cluster.WorkerService/StopTasks": EndpointSync.unary( + method=MethodInfo( + name="StopTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.StopTasksRequest, + output=worker__pb2.Worker.StopTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + function=service.stop_tasks, + ), + "/iris.cluster.WorkerService/PollTasks": EndpointSync.unary( + method=MethodInfo( + name="PollTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.PollTasksRequest, + output=worker__pb2.Worker.PollTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + function=service.poll_tasks, + ), }, interceptors=interceptors, read_max_bytes=read_max_bytes, @@ -512,3 +692,83 @@ def exec_in_container( headers=headers, timeout_ms=timeout_ms, ) + + def ping( + self, + request: worker__pb2.Worker.PingRequest, + *, + headers: Headers | Mapping[str, str] | 
None = None, + timeout_ms: int | None = None, + ) -> worker__pb2.Worker.PingResponse: + return self.execute_unary( + request=request, + method=MethodInfo( + name="Ping", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.PingRequest, + output=worker__pb2.Worker.PingResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + headers=headers, + timeout_ms=timeout_ms, + ) + + def start_tasks( + self, + request: worker__pb2.Worker.StartTasksRequest, + *, + headers: Headers | Mapping[str, str] | None = None, + timeout_ms: int | None = None, + ) -> worker__pb2.Worker.StartTasksResponse: + return self.execute_unary( + request=request, + method=MethodInfo( + name="StartTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.StartTasksRequest, + output=worker__pb2.Worker.StartTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + headers=headers, + timeout_ms=timeout_ms, + ) + + def stop_tasks( + self, + request: worker__pb2.Worker.StopTasksRequest, + *, + headers: Headers | Mapping[str, str] | None = None, + timeout_ms: int | None = None, + ) -> worker__pb2.Worker.StopTasksResponse: + return self.execute_unary( + request=request, + method=MethodInfo( + name="StopTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.StopTasksRequest, + output=worker__pb2.Worker.StopTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + headers=headers, + timeout_ms=timeout_ms, + ) + + def poll_tasks( + self, + request: worker__pb2.Worker.PollTasksRequest, + *, + headers: Headers | Mapping[str, str] | None = None, + timeout_ms: int | None = None, + ) -> worker__pb2.Worker.PollTasksResponse: + return self.execute_unary( + request=request, + method=MethodInfo( + name="PollTasks", + service_name="iris.cluster.WorkerService", + input=worker__pb2.Worker.PollTasksRequest, + output=worker__pb2.Worker.PollTasksResponse, + idempotency_level=IdempotencyLevel.UNKNOWN, + ), + headers=headers, + 
timeout_ms=timeout_ms, + ) diff --git a/lib/iris/src/iris/rpc/worker_pb2.py b/lib/iris/src/iris/rpc/worker_pb2.py index 0791f034f2..be7d86945d 100644 --- a/lib/iris/src/iris/rpc/worker_pb2.py +++ b/lib/iris/src/iris/rpc/worker_pb2.py @@ -26,7 +26,7 @@ from . import time_pb2 as time__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0cworker.proto\x12\x0ciris.cluster\x1a\tjob.proto\x1a\ntime.proto\"\xea\x04\n\x06Worker\x1a\x35\n\x14GetTaskStatusRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskIdJ\x04\x08\x02\x10\x03\x1a\x12\n\x10ListTasksRequest\x1a?\n\x11ListTasksResponse\x12*\n\x05tasks\x18\x01 \x03(\x0b\x32\x14.iris.job.TaskStatusR\x05tasks\x1a\x62\n\x0fKillTaskRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x36\n\x0cterm_timeout\x18\x02 \x01(\x0b\x32\x13.iris.time.DurationR\x0btermTimeout\x1a|\n\x0eHealthResponse\x12\x18\n\x07healthy\x18\x01 \x01(\x08R\x07healthy\x12+\n\x06uptime\x18\x02 \x01(\x0b\x32\x13.iris.time.DurationR\x06uptime\x12#\n\rrunning_tasks\x18\x03 \x01(\x05R\x0crunningTasks\x1at\n\x16\x45xecInContainerRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x18\n\x07\x63ommand\x18\x02 \x03(\tR\x07\x63ommand\x12\'\n\x0ftimeout_seconds\x18\x03 \x01(\x05R\x0etimeoutSeconds\x1a|\n\x17\x45xecInContainerResponse\x12\x1b\n\texit_code\x18\x01 \x01(\x05R\x08\x65xitCode\x12\x16\n\x06stdout\x18\x02 \x01(\tR\x06stdout\x12\x16\n\x06stderr\x18\x03 \x01(\tR\x06stderr\x12\x14\n\x05\x65rror\x18\x04 
\x01(\tR\x05\x65rror2\xdd\x04\n\rWorkerService\x12P\n\rGetTaskStatus\x12).iris.cluster.Worker.GetTaskStatusRequest\x1a\x14.iris.job.TaskStatus\x12Z\n\tListTasks\x12%.iris.cluster.Worker.ListTasksRequest\x1a&.iris.cluster.Worker.ListTasksResponse\x12\x43\n\x0bHealthCheck\x12\x0f.iris.job.Empty\x1a#.iris.cluster.Worker.HealthResponse\x12\x44\n\tHeartbeat\x12\x1a.iris.job.HeartbeatRequest\x1a\x1b.iris.job.HeartbeatResponse\x12J\n\x0bProfileTask\x12\x1c.iris.job.ProfileTaskRequest\x1a\x1d.iris.job.ProfileTaskResponse\x12Y\n\x10GetProcessStatus\x12!.iris.job.GetProcessStatusRequest\x1a\".iris.job.GetProcessStatusResponse\x12l\n\x0f\x45xecInContainer\x12+.iris.cluster.Worker.ExecInContainerRequest\x1a,.iris.cluster.Worker.ExecInContainerResponseBp\n\x10\x63om.iris.clusterB\x0bWorkerProtoP\x01\xa2\x02\x03ICX\xaa\x02\x0cIris.Cluster\xca\x02\x0cIris\\Cluster\xe2\x02\x18Iris\\Cluster\\GPBMetadata\xea\x02\rIris::Clusterb\x08\x65\x64itionsp\xe8\x07') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0cworker.proto\x12\x0ciris.cluster\x1a\tjob.proto\x1a\ntime.proto\"\xdb\t\n\x06Worker\x1a\x35\n\x14GetTaskStatusRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskIdJ\x04\x08\x02\x10\x03\x1a\x12\n\x10ListTasksRequest\x1a?\n\x11ListTasksResponse\x12*\n\x05tasks\x18\x01 \x03(\x0b\x32\x14.iris.job.TaskStatusR\x05tasks\x1a\x62\n\x0fKillTaskRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x36\n\x0cterm_timeout\x18\x02 \x01(\x0b\x32\x13.iris.time.DurationR\x0btermTimeout\x1a|\n\x0eHealthResponse\x12\x18\n\x07healthy\x18\x01 \x01(\x08R\x07healthy\x12+\n\x06uptime\x18\x02 \x01(\x0b\x32\x13.iris.time.DurationR\x06uptime\x12#\n\rrunning_tasks\x18\x03 \x01(\x05R\x0crunningTasks\x1at\n\x16\x45xecInContainerRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x18\n\x07\x63ommand\x18\x02 \x03(\tR\x07\x63ommand\x12\'\n\x0ftimeout_seconds\x18\x03 \x01(\x05R\x0etimeoutSeconds\x1a|\n\x17\x45xecInContainerResponse\x12\x1b\n\texit_code\x18\x01 
\x01(\x05R\x08\x65xitCode\x12\x16\n\x06stdout\x18\x02 \x01(\tR\x06stdout\x12\x16\n\x06stderr\x18\x03 \x01(\tR\x06stderr\x12\x14\n\x05\x65rror\x18\x04 \x01(\tR\x05\x65rror\x1a\r\n\x0bPingRequest\x1a\x9a\x01\n\x0cPingResponse\x12M\n\x11resource_snapshot\x18\x01 \x01(\x0b\x32 .iris.job.WorkerResourceSnapshotR\x10resourceSnapshot\x12\x18\n\x07healthy\x18\x02 \x01(\x08R\x07healthy\x12!\n\x0chealth_error\x18\x03 \x01(\tR\x0bhealthError\x1a\x43\n\x11StartTasksRequest\x12.\n\x05tasks\x18\x01 \x03(\x0b\x32\x18.iris.job.RunTaskRequestR\x05tasks\x1a\x46\n\x12StartTasksResponse\x12\x30\n\x04\x61\x63ks\x18\x01 \x03(\x0b\x32\x1c.iris.cluster.Worker.TaskAckR\x04\x61\x63ks\x1aT\n\x07TaskAck\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x1a\n\x08\x61\x63\x63\x65pted\x18\x02 \x01(\x08R\x08\x61\x63\x63\x65pted\x12\x14\n\x05\x65rror\x18\x03 \x01(\tR\x05\x65rror\x1a-\n\x10StopTasksRequest\x12\x19\n\x08task_ids\x18\x01 \x03(\tR\x07taskIds\x1a\x13\n\x11StopTasksResponse\x1aU\n\x10PollTasksRequest\x12\x41\n\x0e\x65xpected_tasks\x18\x01 \x03(\x0b\x32\x1a.iris.job.WorkerTaskStatusR\rexpectedTasks\x1a\x45\n\x11PollTasksResponse\x12\x30\n\x05tasks\x18\x01 \x03(\x0b\x32\x1a.iris.job.WorkerTaskStatusR\x05tasks2\xc1\x07\n\rWorkerService\x12P\n\rGetTaskStatus\x12).iris.cluster.Worker.GetTaskStatusRequest\x1a\x14.iris.job.TaskStatus\x12Z\n\tListTasks\x12%.iris.cluster.Worker.ListTasksRequest\x1a&.iris.cluster.Worker.ListTasksResponse\x12\x43\n\x0bHealthCheck\x12\x0f.iris.job.Empty\x1a#.iris.cluster.Worker.HealthResponse\x12\x44\n\tHeartbeat\x12\x1a.iris.job.HeartbeatRequest\x1a\x1b.iris.job.HeartbeatResponse\x12J\n\x0bProfileTask\x12\x1c.iris.job.ProfileTaskRequest\x1a\x1d.iris.job.ProfileTaskResponse\x12Y\n\x10GetProcessStatus\x12!.iris.job.GetProcessStatusRequest\x1a\".iris.job.GetProcessStatusResponse\x12l\n\x0f\x45xecInContainer\x12+.iris.cluster.Worker.ExecInContainerRequest\x1a,.iris.cluster.Worker.ExecInContainerResponse\x12K\n\x04Ping\x12 
.iris.cluster.Worker.PingRequest\x1a!.iris.cluster.Worker.PingResponse\x12]\n\nStartTasks\x12&.iris.cluster.Worker.StartTasksRequest\x1a\'.iris.cluster.Worker.StartTasksResponse\x12Z\n\tStopTasks\x12%.iris.cluster.Worker.StopTasksRequest\x1a&.iris.cluster.Worker.StopTasksResponse\x12Z\n\tPollTasks\x12%.iris.cluster.Worker.PollTasksRequest\x1a&.iris.cluster.Worker.PollTasksResponseBp\n\x10\x63om.iris.clusterB\x0bWorkerProtoP\x01\xa2\x02\x03ICX\xaa\x02\x0cIris.Cluster\xca\x02\x0cIris\\Cluster\xe2\x02\x18Iris\\Cluster\\GPBMetadata\xea\x02\rIris::Clusterb\x08\x65\x64itionsp\xe8\x07') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -35,7 +35,7 @@ _globals['DESCRIPTOR']._loaded_options = None _globals['DESCRIPTOR']._serialized_options = b'\n\020com.iris.clusterB\013WorkerProtoP\001\242\002\003ICX\252\002\014Iris.Cluster\312\002\014Iris\\Cluster\342\002\030Iris\\Cluster\\GPBMetadata\352\002\rIris::Cluster' _globals['_WORKER']._serialized_start=54 - _globals['_WORKER']._serialized_end=672 + _globals['_WORKER']._serialized_end=1297 _globals['_WORKER_GETTASKSTATUSREQUEST']._serialized_start=64 _globals['_WORKER_GETTASKSTATUSREQUEST']._serialized_end=117 _globals['_WORKER_LISTTASKSREQUEST']._serialized_start=119 @@ -50,6 +50,24 @@ _globals['_WORKER_EXECINCONTAINERREQUEST']._serialized_end=546 _globals['_WORKER_EXECINCONTAINERRESPONSE']._serialized_start=548 _globals['_WORKER_EXECINCONTAINERRESPONSE']._serialized_end=672 - _globals['_WORKERSERVICE']._serialized_start=675 - _globals['_WORKERSERVICE']._serialized_end=1280 + _globals['_WORKER_PINGREQUEST']._serialized_start=674 + _globals['_WORKER_PINGREQUEST']._serialized_end=687 + _globals['_WORKER_PINGRESPONSE']._serialized_start=690 + _globals['_WORKER_PINGRESPONSE']._serialized_end=844 + _globals['_WORKER_STARTTASKSREQUEST']._serialized_start=846 + _globals['_WORKER_STARTTASKSREQUEST']._serialized_end=913 + _globals['_WORKER_STARTTASKSRESPONSE']._serialized_start=915 + 
_globals['_WORKER_STARTTASKSRESPONSE']._serialized_end=985 + _globals['_WORKER_TASKACK']._serialized_start=987 + _globals['_WORKER_TASKACK']._serialized_end=1071 + _globals['_WORKER_STOPTASKSREQUEST']._serialized_start=1073 + _globals['_WORKER_STOPTASKSREQUEST']._serialized_end=1118 + _globals['_WORKER_STOPTASKSRESPONSE']._serialized_start=1120 + _globals['_WORKER_STOPTASKSRESPONSE']._serialized_end=1139 + _globals['_WORKER_POLLTASKSREQUEST']._serialized_start=1141 + _globals['_WORKER_POLLTASKSREQUEST']._serialized_end=1226 + _globals['_WORKER_POLLTASKSRESPONSE']._serialized_start=1228 + _globals['_WORKER_POLLTASKSRESPONSE']._serialized_end=1297 + _globals['_WORKERSERVICE']._serialized_start=1300 + _globals['_WORKERSERVICE']._serialized_end=2261 # @@protoc_insertion_point(module_scope) diff --git a/lib/iris/src/iris/rpc/worker_pb2.pyi b/lib/iris/src/iris/rpc/worker_pb2.pyi index 6a89706583..e02fc717ce 100644 --- a/lib/iris/src/iris/rpc/worker_pb2.pyi +++ b/lib/iris/src/iris/rpc/worker_pb2.pyi @@ -59,4 +59,53 @@ class Worker(_message.Message): stderr: str error: str def __init__(self, exit_code: _Optional[int] = ..., stdout: _Optional[str] = ..., stderr: _Optional[str] = ..., error: _Optional[str] = ...) -> None: ... + class PingRequest(_message.Message): + __slots__ = () + def __init__(self) -> None: ... + class PingResponse(_message.Message): + __slots__ = ("resource_snapshot", "healthy", "health_error") + RESOURCE_SNAPSHOT_FIELD_NUMBER: _ClassVar[int] + HEALTHY_FIELD_NUMBER: _ClassVar[int] + HEALTH_ERROR_FIELD_NUMBER: _ClassVar[int] + resource_snapshot: _job_pb2.WorkerResourceSnapshot + healthy: bool + health_error: str + def __init__(self, resource_snapshot: _Optional[_Union[_job_pb2.WorkerResourceSnapshot, _Mapping]] = ..., healthy: _Optional[bool] = ..., health_error: _Optional[str] = ...) -> None: ... 
+ class StartTasksRequest(_message.Message): + __slots__ = ("tasks",) + TASKS_FIELD_NUMBER: _ClassVar[int] + tasks: _containers.RepeatedCompositeFieldContainer[_job_pb2.RunTaskRequest] + def __init__(self, tasks: _Optional[_Iterable[_Union[_job_pb2.RunTaskRequest, _Mapping]]] = ...) -> None: ... + class StartTasksResponse(_message.Message): + __slots__ = ("acks",) + ACKS_FIELD_NUMBER: _ClassVar[int] + acks: _containers.RepeatedCompositeFieldContainer[Worker.TaskAck] + def __init__(self, acks: _Optional[_Iterable[_Union[Worker.TaskAck, _Mapping]]] = ...) -> None: ... + class TaskAck(_message.Message): + __slots__ = ("task_id", "accepted", "error") + TASK_ID_FIELD_NUMBER: _ClassVar[int] + ACCEPTED_FIELD_NUMBER: _ClassVar[int] + ERROR_FIELD_NUMBER: _ClassVar[int] + task_id: str + accepted: bool + error: str + def __init__(self, task_id: _Optional[str] = ..., accepted: _Optional[bool] = ..., error: _Optional[str] = ...) -> None: ... + class StopTasksRequest(_message.Message): + __slots__ = ("task_ids",) + TASK_IDS_FIELD_NUMBER: _ClassVar[int] + task_ids: _containers.RepeatedScalarFieldContainer[str] + def __init__(self, task_ids: _Optional[_Iterable[str]] = ...) -> None: ... + class StopTasksResponse(_message.Message): + __slots__ = () + def __init__(self) -> None: ... + class PollTasksRequest(_message.Message): + __slots__ = ("expected_tasks",) + EXPECTED_TASKS_FIELD_NUMBER: _ClassVar[int] + expected_tasks: _containers.RepeatedCompositeFieldContainer[_job_pb2.WorkerTaskStatus] + def __init__(self, expected_tasks: _Optional[_Iterable[_Union[_job_pb2.WorkerTaskStatus, _Mapping]]] = ...) -> None: ... + class PollTasksResponse(_message.Message): + __slots__ = ("tasks",) + TASKS_FIELD_NUMBER: _ClassVar[int] + tasks: _containers.RepeatedCompositeFieldContainer[_job_pb2.WorkerTaskStatus] + def __init__(self, tasks: _Optional[_Iterable[_Union[_job_pb2.WorkerTaskStatus, _Mapping]]] = ...) -> None: ... def __init__(self) -> None: ... 
diff --git a/lib/iris/tests/cluster/controller/conftest.py b/lib/iris/tests/cluster/controller/conftest.py index a06c6190ea..0f1dbb4ebf 100644 --- a/lib/iris/tests/cluster/controller/conftest.py +++ b/lib/iris/tests/cluster/controller/conftest.py @@ -108,6 +108,22 @@ def profile_task( ) -> job_pb2.ProfileTaskResponse: raise ProviderUnsupportedError("fake") + # --- Split heartbeat surface (no-op stubs so split-mode tests can run) --- + + def ping_workers(self, workers): + return [] + + def start_tasks(self, jobs): + from iris.rpc import worker_pb2 + + return [(wid, worker_pb2.Worker.StartTasksResponse(), None) for wid, _, _ in jobs] + + def stop_tasks(self, jobs): + return [(wid, None) for wid, _, _ in jobs] + + def poll_workers(self, running, worker_addresses): + return [] + def close(self) -> None: pass