diff --git a/lib/iris/OPS.md b/lib/iris/OPS.md index 3496bea562..570c3ca110 100644 --- a/lib/iris/OPS.md +++ b/lib/iris/OPS.md @@ -118,9 +118,14 @@ SELECT slice_id, lifecycle, scale_group, worker_ids FROM slices WHERE lifecycle= -- Task attempt history (debugging retries) SELECT task_id, attempt_id, state, exit_code, error FROM task_attempts WHERE task_id LIKE '%%' ORDER BY attempt_id; +``` + +Controller audit events (`event=<action> entity=<id> trigger=<trigger> ...`) are +emitted as structured `logger.info` lines — query them through +`iris process logs` with a substring filter, not via SQL. Example: --- What the controller has been doing -SELECT kind, count(*) FROM txn_log GROUP BY kind ORDER BY count(*) DESC LIMIT 10; +```bash +iris process logs --since 24h | grep 'event=worker_heartbeat_failed' ``` Full table list: `iris query "SELECT name FROM sqlite_master WHERE type='table'"`. diff --git a/lib/iris/dashboard/src/App.vue b/lib/iris/dashboard/src/App.vue index 193879bee0..e843066c72 100644 --- a/lib/iris/dashboard/src/App.vue +++ b/lib/iris/dashboard/src/App.vue @@ -20,7 +20,6 @@ const WORKER_TABS: Tab[] = [ { key: 'fleet', label: 'Workers', to: '/fleet' }, { key: 'endpoints', label: 'Endpoints', to: '/endpoints' }, { key: 'autoscaler', label: 'Autoscaler', to: '/autoscaler' }, - { key: 'transactions', label: 'Transactions', to: '/transactions' }, { key: 'account', label: 'Account', to: '/account' }, { key: 'status', label: 'Status', to: '/status' }, ] @@ -30,7 +29,6 @@ const KUBERNETES_TABS: Tab[] = [ { key: 'scheduler', label: 'Scheduler', to: '/scheduler' }, { key: 'cluster', label: 'Cluster', to: '/cluster' }, { key: 'endpoints', label: 'Endpoints', to: '/endpoints' }, - { key: 'transactions', label: 'Transactions', to: '/transactions' }, { key: 'account', label: 'Account', to: '/account' }, { key: 'status', label: 'Status', to: '/status' }, ] @@ -46,7 +44,6 @@ const PATH_TO_TAB: Record = { '/cluster': 'cluster', '/endpoints': 'endpoints', '/autoscaler': 'autoscaler', - '/transactions': 'transactions', '/account': 'account', '/status': 'status', } diff --git a/lib/iris/dashboard/src/components/controller/TransactionsTab.vue b/lib/iris/dashboard/src/components/controller/TransactionsTab.vue deleted file mode 100644 index 7decc08cbe..0000000000 --- a/lib/iris/dashboard/src/components/controller/TransactionsTab.vue +++ /dev/null @@ -1,102 +0,0 @@ - - - diff --git a/lib/iris/dashboard/src/router.ts b/lib/iris/dashboard/src/router.ts index 13c2422ce4..6d598d495f 100644 --- a/lib/iris/dashboard/src/router.ts +++ b/lib/iris/dashboard/src/router.ts @@ -30,10 +30,6 @@ const routes = [ path: '/status', component: () => import('./components/controller/StatusTab.vue'), }, - { - path: '/transactions', - component: () => import('./components/controller/TransactionsTab.vue'), - }, { path: '/scheduler', component: () => import('./components/controller/SchedulerTab.vue'), }, diff --git a/lib/iris/dashboard/src/types/rpc.ts b/lib/iris/dashboard/src/types/rpc.ts index e1de5f46d3..b9d037901c 100644 --- a/lib/iris/dashboard/src/types/rpc.ts +++ b/lib/iris/dashboard/src/types/rpc.ts @@ -458,19 +458,6 @@ export interface GetProcessStatusResponse { logEntries?: LogEntry[] } -// -- Transactions -- - -export interface TransactionAction { - timestamp?: ProtoTimestamp - action?: string - entityId?: string - details?: string -} - -export interface GetTransactionsResponse { - actions: TransactionAction[] -} - // -- Task State Counts (used in job summaries and user summaries) -- /** Mapping from lowercase state name to count, e.g.
{ running: 2, pending: 5 } */ diff --git a/lib/iris/scripts/benchmark_db_queries.py b/lib/iris/scripts/benchmark_db_queries.py index d87e255612..3d37ca70a3 100644 --- a/lib/iris/scripts/benchmark_db_queries.py +++ b/lib/iris/scripts/benchmark_db_queries.py @@ -102,8 +102,6 @@ "task_resource_history", "endpoints", "reservation_claims", - "txn_log", - "txn_actions", "meta", "schema_migrations", ] diff --git a/lib/iris/src/iris/cluster/controller/auth.py b/lib/iris/src/iris/cluster/controller/auth.py index 4442df0bde..1830008eb3 100644 --- a/lib/iris/src/iris/cluster/controller/auth.py +++ b/lib/iris/src/iris/cluster/controller/auth.py @@ -56,6 +56,13 @@ def create_api_key( "VALUES (?, ?, ?, ?, ?, ?, ?)", (key_id, key_hash, key_prefix, user_id, name, now.epoch_ms(), expires_at.epoch_ms() if expires_at else None), ) + logger.info( + "event=api_key_created entity=%s trigger=- user=%s name=%s expires_at_ms=%s", + key_id, + user_id, + name, + expires_at.epoch_ms() if expires_at else "-", + ) def lookup_api_key_by_hash(db: ControllerDB, key_hash: str) -> ApiKeyRow | None: @@ -81,7 +88,10 @@ def revoke_api_key(db: ControllerDB, key_id: str, now: Timestamp) -> bool: f"UPDATE {db.api_keys_table} SET revoked_at_ms = ? WHERE key_id = ? AND revoked_at_ms IS NULL", (now.epoch_ms(), key_id), ) - return cur._cursor.rowcount > 0 + revoked = cur._cursor.rowcount > 0 + if revoked: + logger.info("event=api_key_revoked entity=%s trigger=-", key_id) + return revoked def list_api_keys(db: ControllerDB, user_id: str | None = None) -> list[ApiKeyRow]: @@ -111,6 +121,11 @@ def revoke_login_keys_for_user(db: ControllerDB, user_id: str, now: Timestamp) - " WHERE user_id = ? AND name LIKE 'login-%' AND revoked_at_ms IS NULL", (now.epoch_ms(), user_id), ) + logger.info( + "event=login_keys_revoked entity=%s trigger=- count=%d", + user_id, + len(revoked_ids), + ) return revoked_ids diff --git a/lib/iris/src/iris/cluster/controller/controller.py b/lib/iris/src/iris/cluster/controller/controller.py index d525a3a7d3..e60d16020a 100644 --- a/lib/iris/src/iris/cluster/controller/controller.py +++ b/lib/iris/src/iris/cluster/controller/controller.py @@ -101,6 +101,7 @@ ReservationClaim, SchedulingEvent, TaskUpdate, + log_event, ) from iris.cluster.controller.worker_health import WorkerHealthTracker from iris.cluster.log_store import CONTROLLER_LOG_KEY @@ -936,9 +937,6 @@ class ControllerConfig: worker_retention: Duration = field(default_factory=lambda: Duration.from_seconds(86400)) """Delete inactive/unhealthy workers whose last heartbeat exceeds this (default: 24 hours).""" - txn_action_retention: Duration = field(default_factory=lambda: Duration.from_seconds(3 * 86400)) - """Delete txn_actions older than this (default: 3 days).""" - profile_retention: Duration = field(default_factory=lambda: Duration.from_seconds(86400)) """Delete task_profiles older than this (default: 24 hours).""" @@ -1419,7 +1417,6 @@ def _run_prune_loop(self, stop_event: threading.Event) -> None: self._transitions.prune_old_data( job_retention=self._config.job_retention, worker_retention=self._config.worker_retention, - txn_action_retention=self._config.txn_action_retention, profile_retention=self._config.profile_retention, stop_event=stop_event, ) @@ -1655,6 +1652,7 @@ def _cleanup_stale_claims(self, claims: dict[WorkerId, ReservationClaim] | None del claims[wid] if stale and persisted: self._transitions.replace_reservation_claims(claims) + log_event("reservation_claims_cleaned", "controller", count=len(stale)) return bool(stale) def 
_claim_workers_for_reservations(self, claims: dict[WorkerId, ReservationClaim] | None = None) -> bool: @@ -1702,6 +1700,7 @@ def _claim_workers_for_reservations(self, claims: dict[WorkerId, ReservationClai break if changed and persisted: self._transitions.replace_reservation_claims(claims) + log_event("reservation_claims_updated", "controller", total_claims=len(claims)) return changed def _run_scheduling(self) -> SchedulingOutcome: @@ -1772,6 +1771,14 @@ def _run_scheduling(self) -> SchedulingOutcome: self._cache_scheduling_diagnostics(context, tainted_jobs, all_assignments, order.ordered_task_ids) if all_assignments or preemptions: + log_event( + "scheduling_pass_completed", + "scheduler", + assignments=len(all_assignments), + preempted=len(preemptions), + pending=len(state.pending_tasks), + workers=len(state.workers), + ) return SchedulingOutcome.ASSIGNMENTS_MADE return SchedulingOutcome.NO_ASSIGNMENTS @@ -2188,8 +2195,13 @@ def _dispatch_assignments_direct( # the task state machine bounces it back to PENDING — see # transitions._apply_task_transition: WORKER_FAILED from ASSIGNED # rolls the task to PENDING without consuming a preemption retry. - logger.warning("StartTasks RPC failed for worker %s: %s", worker_id, error) - summary = f"StartTasks RPC failed: {error}" + log_event( + "dispatch_failed", + str(worker_id), + trigger="start_tasks_rpc", + task_count=len(tasks_by_worker.get(worker_id, [])), + error=error, + ) self._task_update_queue.put( HeartbeatApplyRequest( worker_id=worker_id, @@ -2199,7 +2211,7 @@ def _dispatch_assignments_direct( task_id=JobName.from_wire(t.task_id), attempt_id=attempt_by_worker_task.get((worker_id, t.task_id), -1), new_state=job_pb2.TASK_STATE_WORKER_FAILED, - error=summary, + error=f"StartTasks RPC failed: {error}", ) for t in tasks_by_worker.get(worker_id, []) ], @@ -2209,7 +2221,13 @@ def _dispatch_assignments_direct( assert response is not None for ack in response.acks: if not ack.accepted: - logger.warning("Worker %s rejected task %s: %s", worker_id, ack.task_id, ack.error) + log_event( + "task_rejected", + ack.task_id, + trigger="start_tasks_ack", + worker=str(worker_id), + error=ack.error, + ) self._task_update_queue.put( HeartbeatApplyRequest( worker_id=worker_id, @@ -2367,6 +2385,8 @@ def _terminate_workers(self, worker_ids: list[str], reason: str, sibling_reason: Returns the set of worker_ids that were actually removed (primary + siblings), so callers can drop them from in-memory state like the health tracker. 
""" + for wid in worker_ids: + log_event("worker_failing", wid, trigger=reason) failure_result = self._transitions.fail_workers_batch(worker_ids, reason=reason) removed: list[WorkerId] = [] for wid, addr in failure_result.removed_workers: @@ -2376,6 +2396,8 @@ def _terminate_workers(self, worker_ids: list[str], reason: str, sibling_reason: sibling_worker_ids = self._autoscaler.terminate_slices_for_workers( [str(wid) for wid, _ in failure_result.removed_workers] ) + for wid in sibling_worker_ids: + log_event("worker_failing", str(wid), trigger=sibling_reason) sibling_failures = self._transitions.fail_workers_batch( sibling_worker_ids, reason=sibling_reason, @@ -2466,32 +2488,12 @@ def _handle_failed_heartbeats( acc.all_tasks_to_kill.update(failure_result.tasks_to_kill) acc.all_task_kill_workers.update(failure_result.task_kill_workers) - for (batch, error), result in zip(failure_entries, failure_result.results, strict=False): - last_success_age_s = ( - "unknown" if result.last_heartbeat_age_ms is None else f"{result.last_heartbeat_age_ms / 1000.0:.1f}" - ) - logger.warning( - "Heartbeat RPC failure: worker=%s address=%s action=%s last_success_age_s=%s " - "expected=%d run=%d kill=%d error=%s", - batch.worker_id, - batch.worker_address or "", - result.action.value, - last_success_age_s, - len(batch.running_tasks), - len(batch.tasks_to_run), - len(batch.tasks_to_kill), - error, - ) + for batch, _error in failure_entries: acc.fail_count += 1 acc.transient_failed_workers.append(batch.worker_id) unhealthy = self._health.workers_over_threshold() if unhealthy: - logger.warning( - "Failing %d workers over ping threshold: %s", - len(unhealthy), - [str(wid) for wid in unhealthy[:10]], - ) removed = self._terminate_workers( [str(wid) for wid in unhealthy], reason="worker ping threshold exceeded", @@ -2512,6 +2514,8 @@ def _handle_sibling_worker_failures( if not self._autoscaler or not primary_failed_workers: return sibling_worker_ids = self._autoscaler.terminate_slices_for_workers(primary_failed_workers) + for wid in sibling_worker_ids: + log_event("worker_failing", str(wid), trigger="sibling_slice_terminated") # TODO(#3425): This prunes sibling workers before their in-flight # results are processed, causing apply_heartbeat() to # silently drop any logs/states those workers reported this round. 
@@ -2526,11 +2530,6 @@ def _handle_sibling_worker_failures( if sibling_failures.removed_workers: acc.fail_count += len(sibling_failures.removed_workers) acc.terminal_failed_workers.extend(wid for wid, _ in sibling_failures.removed_workers) - logger.info( - "Failed %d sibling workers from slices: %s", - len(sibling_failures.removed_workers), - [wid for wid, _ in sibling_failures.removed_workers], - ) def _log_sync_health_summary( self, @@ -2541,28 +2540,15 @@ def _log_sync_health_summary( elapsed_ms: int, ) -> None: """Log provider sync timing and periodic cluster health summary.""" - level = logging.WARNING if elapsed_ms > _SLOW_HEARTBEAT_MS else logging.DEBUG - logger.log( - level, - "Provider sync: %d workers, %d failed (%d transient, %d terminal), %dms", - batch_count, - fail_count, - len(transient_failed_workers), - len(terminal_failed_workers), - elapsed_ms, - ) - if transient_failed_workers: - logger.log( - level, - "Provider sync transient failures (%d): [%s]", - len(transient_failed_workers), - ", ".join(transient_failed_workers), - ) - if terminal_failed_workers: - logger.warning( - "Provider sync terminal failures (%d): [%s]", - len(terminal_failed_workers), - ", ".join(terminal_failed_workers), + if elapsed_ms > _SLOW_HEARTBEAT_MS or transient_failed_workers or terminal_failed_workers: + log_event( + "provider_sync", + "controller", + workers=batch_count, + failed=fail_count, + transient=len(transient_failed_workers), + terminal=len(terminal_failed_workers), + elapsed_ms=elapsed_ms, ) self._heartbeat_iteration += 1 @@ -2573,12 +2559,13 @@ def _log_sync_health_summary( 0 ] # type: ignore[index] pending = len(_schedulable_tasks(self._db)) - logger.info( - "Controller status: %d workers (%d failed), %d active jobs, %d pending tasks", - len(workers), - fail_count, - active, - pending, + log_event( + "controller_status", + "controller", + workers=len(workers), + failed=fail_count, + active_jobs=active, + pending_tasks=pending, ) def _run_autoscaler_once(self) -> None: @@ -2638,12 +2625,13 @@ def begin_checkpoint(self) -> tuple[str, CheckpointResult]: path, result = upload_checkpoint(self._db, backup, self._config.remote_state_dir) finally: backup.cleanup() - logger.info( - "Checkpoint written: %s (jobs=%d tasks=%d workers=%d)", - path, - result.job_count, - result.task_count, - result.worker_count, + log_event( + "checkpoint_written", + "controller", + path=path, + jobs=result.job_count, + tasks=result.task_count, + workers=result.worker_count, ) return path, result diff --git a/lib/iris/src/iris/cluster/controller/migrations/0004_worker_indexes.py b/lib/iris/src/iris/cluster/controller/migrations/0004_worker_indexes.py index c24b067894..4f77b0acbf 100644 --- a/lib/iris/src/iris/cluster/controller/migrations/0004_worker_indexes.py +++ b/lib/iris/src/iris/cluster/controller/migrations/0004_worker_indexes.py @@ -5,18 +5,9 @@ def migrate(conn: sqlite3.Connection) -> None: - conn.executescript( - """ -DROP TRIGGER IF EXISTS trg_txn_log_retention; -CREATE TRIGGER IF NOT EXISTS trg_txn_log_retention -AFTER INSERT ON txn_log -WHEN (SELECT COUNT(*) FROM txn_log) > 1100 -BEGIN - DELETE FROM txn_log WHERE id <= ( - SELECT id FROM txn_log ORDER BY id DESC LIMIT 1 OFFSET 1000 - ); -END; - -CREATE INDEX IF NOT EXISTS idx_workers_healthy_active ON workers(healthy, active); -""" - ) + # Originally this migration also rewrote the `trg_txn_log_retention` + # trigger; those statements were removed once migration 0037 dropped the + # `txn_log` / `txn_actions` tables entirely. 
On DBs that already ran the + # old form the trigger survives until 0037 executes; 0037 is idempotent + # (`DROP TRIGGER IF EXISTS`) so no fixup is needed here. + conn.execute("CREATE INDEX IF NOT EXISTS idx_workers_healthy_active ON workers(healthy, active)") diff --git a/lib/iris/src/iris/cluster/controller/migrations/0037_drop_txn_log_and_txn_actions.py b/lib/iris/src/iris/cluster/controller/migrations/0037_drop_txn_log_and_txn_actions.py new file mode 100644 index 0000000000..328c0aa6e4 --- /dev/null +++ b/lib/iris/src/iris/cluster/controller/migrations/0037_drop_txn_log_and_txn_actions.py @@ -0,0 +1,19 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Drop the `txn_log` and `txn_actions` audit tables. + +The controller no longer records mutating transitions to SQLite; every +state change now emits a semi-structured `logger.info` line via +``transitions.log_event`` that is captured by the Iris log server and +queried through the normal log-store DuckDB interface. +""" + +import sqlite3 + + +def migrate(conn: sqlite3.Connection) -> None: + conn.execute("DROP TRIGGER IF EXISTS trg_txn_log_retention") + conn.execute("DROP INDEX IF EXISTS idx_txn_actions_txn") + conn.execute("DROP TABLE IF EXISTS txn_actions") + conn.execute("DROP TABLE IF EXISTS txn_log") diff --git a/lib/iris/src/iris/cluster/controller/schema.py b/lib/iris/src/iris/cluster/controller/schema.py index 617e609220..594af63c60 100644 --- a/lib/iris/src/iris/cluster/controller/schema.py +++ b/lib/iris/src/iris/cluster/controller/schema.py @@ -1112,62 +1112,6 @@ def generate_full_ddl(tables: Sequence[Table]) -> str: indexes=("CREATE INDEX IF NOT EXISTS idx_dispatch_worker ON dispatch_queue(worker_id, id)",), ) -TXN_LOG = Table( - "txn_log", - "tl", - columns=( - Column("id", "INTEGER", "PRIMARY KEY AUTOINCREMENT"), - Column("kind", "TEXT", "NOT NULL", python_type=str, decoder=str), - Column("payload_json", "TEXT", "NOT NULL", python_name="payload", python_type=dict, decoder=_decode_json_dict), - Column( - "created_at_ms", - "INTEGER", - "NOT NULL", - python_name="created_at", - python_type=Timestamp, - decoder=decode_timestamp_ms, - ), - ), - triggers=( - # Migration 0004_worker_indexes rewrote the trigger from 0001 - """CREATE TRIGGER IF NOT EXISTS trg_txn_log_retention -AFTER INSERT ON txn_log -WHEN (SELECT COUNT(*) FROM txn_log) > 1100 -BEGIN - DELETE FROM txn_log WHERE id <= ( - SELECT id FROM txn_log ORDER BY id DESC LIMIT 1 OFFSET 1000 - ); -END;""", - ), -) - -TXN_ACTIONS = Table( - "txn_actions", - "ta2", - columns=( - Column("id", "INTEGER", "PRIMARY KEY AUTOINCREMENT"), - Column( - "txn_id", - "INTEGER", - "NOT NULL REFERENCES txn_log(id) ON DELETE CASCADE", - python_type=int, - decoder=int, - ), - Column("action", "TEXT", "NOT NULL", python_type=str, decoder=str), - Column("entity_id", "TEXT", "NOT NULL", python_type=str, decoder=str), - Column("details_json", "TEXT", "NOT NULL", python_name="details", python_type=dict, decoder=_decode_json_dict), - Column( - "created_at_ms", - "INTEGER", - "NOT NULL", - python_name="timestamp", - python_type=Timestamp, - decoder=decode_timestamp_ms, - ), - ), - indexes=("CREATE INDEX IF NOT EXISTS idx_txn_actions_txn ON txn_actions(txn_id, id)",), -) - # Migration 0003: restructured scaling_groups SCALING_GROUPS = Table( "scaling_groups", @@ -1370,8 +1314,6 @@ def generate_full_ddl(tables: Sequence[Table]) -> str: TASK_RESOURCE_HISTORY, ENDPOINTS, DISPATCH_QUEUE, - TXN_LOG, - TXN_ACTIONS, SCALING_GROUPS, SLICES, RESERVATION_CLAIMS, @@ -1632,16 +1574,6 
@@ class EndpointRow: registered_at: Timestamp -@dataclass(frozen=True, slots=True) -class TransactionActionRow: - """Transaction action log entry.""" - - timestamp: Timestamp - action: str - entity_id: str - details: dict - - @dataclass(frozen=True, slots=True) class ApiKeyRow: """API key record.""" @@ -1947,15 +1879,6 @@ def _job_columns(*names: str) -> tuple[tuple[Column, ...], tuple[str, ...]]: row_cls=EndpointRow, ) -# Transaction action row. -TXN_ACTION_PROJECTION = TXN_ACTIONS.projection( - "created_at_ms", - "action", - "entity_id", - "details_json", - row_cls=TransactionActionRow, -) - # API key row. API_KEY_PROJECTION = AUTH_API_KEYS.projection( "key_id", diff --git a/lib/iris/src/iris/cluster/controller/service.py b/lib/iris/src/iris/cluster/controller/service.py index 45907ab912..c9362a50dd 100644 --- a/lib/iris/src/iris/cluster/controller/service.py +++ b/lib/iris/src/iris/cluster/controller/service.py @@ -74,14 +74,12 @@ JOB_ROW_PROJECTION, TASK_DETAIL_PROJECTION, TASK_ROW_PROJECTION, - TXN_ACTION_PROJECTION, WORKER_DETAIL_PROJECTION, AttemptRow, EndpointRow, JobDetailRow, JobRow, TaskDetailRow, - TransactionActionRow, WorkerDetailRow, WorkerRow, tasks_with_attempts, @@ -121,9 +119,6 @@ logger = logging.getLogger(__name__) -DEFAULT_TRANSACTION_LIMIT = 50 - - DEFAULT_MAX_TOTAL_LINES = 100000 # Maximum bundle size in bytes (25 MB) - matches client-side limit @@ -798,17 +793,6 @@ def _descendant_jobs(db: ControllerDB, job_id: JobName) -> list[JobDetailRow]: ) -def _transaction_actions(db: ControllerDB, limit: int = 100) -> list[TransactionActionRow]: - with db.read_snapshot() as q: - actions = TXN_ACTION_PROJECTION.decode( - q.fetchall( - f"SELECT {TXN_ACTION_PROJECTION.select_clause()} " "FROM txn_actions ta2 ORDER BY ta2.id DESC LIMIT ?", - (limit,), - ), - ) - return list(reversed(actions)) - - def _live_user_stats(db: ControllerDB) -> list[UserStats]: """Aggregate job/task counts per user for active (non-terminal) jobs.""" active_states = ",".join( @@ -1963,27 +1947,6 @@ def profile_task( error=resp.error, ) - # --- Transactions --- - - def get_transactions( - self, - request: controller_pb2.Controller.GetTransactionsRequest, - ctx: Any, - ) -> controller_pb2.Controller.GetTransactionsResponse: - """Get recent controller actions for the dashboard action log.""" - limit = request.limit if request.limit > 0 else DEFAULT_TRANSACTION_LIMIT - actions = [] - for action in _transaction_actions(self._db, limit=limit): - details_str = json.dumps(action.details) if action.details else "" - proto_action = controller_pb2.Controller.TransactionAction( - action=action.action, - entity_id=action.entity_id, - details=details_str, - ) - proto_action.timestamp.CopyFrom(timestamp_to_proto(action.timestamp)) - actions.append(proto_action) - return controller_pb2.Controller.GetTransactionsResponse(actions=actions) - def list_users( self, request: controller_pb2.Controller.ListUsersRequest, diff --git a/lib/iris/src/iris/cluster/controller/transitions.py b/lib/iris/src/iris/cluster/controller/transitions.py index 6b29c88a48..6dc9172bc3 100644 --- a/lib/iris/src/iris/cluster/controller/transitions.py +++ b/lib/iris/src/iris/cluster/controller/transitions.py @@ -62,6 +62,37 @@ logger = logging.getLogger(__name__) +def log_event( + action: str, + entity_id: str, + *, + trigger: str | None = None, + **details: object, +) -> None: + """Emit a semi-structured audit line for a controller state transition. + + Each call produces one ``logger.info`` line of the shape:: + + event= entity= trigger= k=v ... 
+ + ``trigger`` names the upstream event when this is derived (e.g. + ``trigger=heartbeat_applied`` on cascaded job terminations); callers omit + it for externally-caused events and the line renders ``trigger=-``. + + These lines are captured by the Iris log server and queried via the normal + ``iris process logs`` / log-store DuckDB interface — there is no SQLite + audit table. + """ + extras = " ".join(f"{k}={v}" for k, v in details.items() if v is not None) + logger.info( + "event=%s entity=%s trigger=%s %s", + action, + entity_id, + trigger or "-", + extras, + ) + + @dataclass(frozen=True) class ReservationClaim: """A claim binding a worker to a specific reservation entry. @@ -142,12 +173,11 @@ class PruneResult: jobs_deleted: int = 0 workers_deleted: int = 0 - txn_actions_deleted: int = 0 profiles_deleted: int = 0 @property def total(self) -> int: - return self.jobs_deleted + self.workers_deleted + self.txn_actions_deleted + self.profiles_deleted + return self.jobs_deleted + self.workers_deleted + self.profiles_deleted class HeartbeatAction(enum.Enum): @@ -975,26 +1005,6 @@ def __init__( self._user_budget_defaults = user_budget_defaults or UserBudgetDefaults() self._health = health or WorkerHealthTracker() - def _record_transaction( - self, - cur: Any, - kind: str, - actions: list[tuple[str, str, dict[str, object]]], - *, - payload: dict[str, object] | None = None, - ) -> None: - created_ms = Timestamp.now().epoch_ms() - cur.execute( - "INSERT INTO txn_log(kind, payload_json, created_at_ms) VALUES (?, ?, ?)", - (kind, json.dumps(payload or {}), created_ms), - ) - txn_id = int(cur.lastrowid) - for action, entity_id, details in actions: - cur.execute( - "INSERT INTO txn_actions(txn_id, action, entity_id, details_json, created_at_ms) VALUES (?, ?, ?, ?, ?)", - (txn_id, action, entity_id, json.dumps(details), created_ms), - ) - def _recompute_job_state(self, cur: Any, job_id: JobName) -> int | None: row = cur.execute( f"SELECT j.state, j.started_at_ms, jc.max_task_failures " @@ -1095,7 +1105,6 @@ def submit_job( ) -> SubmitJobResult: """Submit a job and expand its tasks in one DB transaction.""" submitted_ms = ts.epoch_ms() - actions: list[tuple[str, str, dict[str, object]]] = [] created_task_ids: list[JobName] = [] with self._db.transaction() as cur: @@ -1395,8 +1404,7 @@ def submit_job( ), ) - actions.append(("job_submitted", job_id.to_wire(), {"num_tasks": replicas, "error": validation_error})) - self._record_transaction(cur, "submit_job", actions) + log_event("job_submitted", job_id.to_wire(), num_tasks=replicas, error=validation_error) return SubmitJobResult(job_id=job_id, task_ids=created_task_ids) def cancel_job(self, job_id: JobName, reason: str) -> TxResult: @@ -1480,8 +1488,8 @@ def cancel_job(self, job_id: JobName, reason: str) -> TxResult: ), ) self._db.endpoints.remove_by_job_ids(cur, [JobName.from_wire(jid) for jid in subtree_ids]) - self._record_transaction(cur, "cancel_job", [("job_cancelled", job_id.to_wire(), {"reason": reason})]) - return TxResult(tasks_to_kill=tasks_to_kill, task_kill_workers=task_kill_workers) + log_event("job_cancelled", job_id.to_wire(), reason=reason) + return TxResult(tasks_to_kill=tasks_to_kill, task_kill_workers=task_kill_workers) def register_or_refresh_worker( self, @@ -1582,9 +1590,7 @@ def register_or_refresh_worker( "VALUES (?, ?, ?, ?, ?, ?)", (str(worker_id), key, value_type, str_value, int_value, float_value), ) - self._record_transaction( - cur, "register_worker", [("worker_registered", str(worker_id), {"address": address})] - ) + 
log_event("worker_registered", str(worker_id), address=address) # Update in-memory attribute cache so scheduling sees the new worker immediately. attr_dict: dict[str, AttributeValue] = {} for key, value_type, str_value, int_value, float_value in attrs: @@ -1729,9 +1735,8 @@ def queue_assignments(self, assignments: list[Assignment], *, direct_dispatch: b "started_at_ms = COALESCE(started_at_ms, ?) WHERE job_id = ?", (job_pb2.JOB_STATE_PENDING, job_pb2.JOB_STATE_RUNNING, now_ms, job_id_wire), ) - if accepted or rejected: - actions = [("assignment_queued", a.task_id.to_wire(), {"worker_id": str(a.worker_id)}) for a in accepted] - self._record_transaction(cur, "queue_assignments", actions) + for a in accepted: + log_event("assignment_queued", a.task_id.to_wire(), worker=str(a.worker_id)) return AssignmentResult( tasks_to_kill=set(), has_real_dispatch=has_real_dispatch, @@ -2000,10 +2005,9 @@ def _apply_task_transitions( task_kill_workers.update(final_task_kill_workers) cascaded_jobs.add(job_id) if tasks_to_kill or cascaded_jobs: - actions: list[tuple[str, str, dict[str, object]]] = [("heartbeat_applied", str(req.worker_id), {})] + log_event("heartbeat_applied", str(req.worker_id)) for job_id in cascaded_jobs: - actions.append(("job_terminated", job_id.to_wire(), {})) - self._record_transaction(cur, "apply_task_updates", actions) + log_event("job_terminated", job_id.to_wire(), trigger="heartbeat_applied") return TxResult( tasks_to_kill=tasks_to_kill, @@ -2277,11 +2281,7 @@ def record_heartbeat_failure( drained_dispatch, force_remove=force_remove, ) - self._record_transaction( - cur, - "heartbeat_failure", - [("worker_heartbeat_failed", str(worker_id), {"error": error})], - ) + log_event("worker_heartbeat_failed", str(worker_id), error=error) if result.worker_removed: self._db.remove_worker_from_attr_cache(worker_id) return TxResult(tasks_to_kill=result.tasks_to_kill, task_kill_workers=result.task_kill_workers) @@ -2302,11 +2302,7 @@ def fail_heartbeat_for_worker( snapshot, force_remove=force_remove, ) - self._record_transaction( - cur, - "heartbeat_failure", - [("worker_heartbeat_failed", str(worker_id), {"error": error})], - ) + log_event("worker_heartbeat_failed", str(worker_id), error=error) if result.worker_removed: self._db.remove_worker_from_attr_cache(worker_id) return result @@ -2342,7 +2338,6 @@ def fail_heartbeats_batch( for chunk_start in range(0, len(failures), chunk_size): chunk = failures[chunk_start : chunk_start + chunk_size] - chunk_actions: list[tuple[str, str, dict[str, object]]] = [] with self._db.transaction() as cur: now_ms = Timestamp.now().epoch_ms() for snapshot, error in chunk: @@ -2355,17 +2350,19 @@ def fail_heartbeats_batch( now_ms=now_ms, ) results.append(result) - chunk_actions.append(("worker_heartbeat_failed", str(snapshot.worker_id), {"error": error})) + log_event( + "worker_heartbeat_failed", + str(snapshot.worker_id), + address=snapshot.worker_address or "-", + rpc_action=result.action.value, + last_success_age_ms=result.last_heartbeat_age_ms or "-", + running=len(snapshot.running_tasks), + error=error, + ) all_tasks_to_kill.update(result.tasks_to_kill) all_task_kill_workers.update(result.task_kill_workers) if result.worker_removed: removed_workers.append((snapshot.worker_id, snapshot.worker_address)) - self._record_transaction( - cur, - "heartbeat_failures_batch", - chunk_actions, - payload={"count": len(chunk_actions)}, - ) for worker_id, _ in removed_workers: self._db.remove_worker_from_attr_cache(worker_id) @@ -2393,9 +2390,7 @@ def 
mark_task_unschedulable(self, task_id: JobName, reason: str) -> TxResult: now_ms, ) self._recompute_job_state(cur, JobName.from_wire(str(row["job_id"]))) - self._record_transaction( - cur, "mark_task_unschedulable", [("task_unschedulable", task_id.to_wire(), {"reason": reason})] - ) + log_event("task_unschedulable", task_id.to_wire(), reason=reason) return TxResult() def preempt_task(self, task_id: JobName, reason: str) -> TxResult: @@ -2484,7 +2479,7 @@ def preempt_task(self, task_id: JobName, reason: str) -> TxResult: if new_state == job_pb2.TASK_STATE_PREEMPTED: tasks_to_kill.add(task_id) - self._record_transaction(cur, "preempt_task", [("task_preempted", task_id.to_wire(), {"reason": reason})]) + log_event("task_preempted", task_id.to_wire(), reason=reason) return TxResult(tasks_to_kill=tasks_to_kill, task_kill_workers=task_kill_workers) @@ -2613,11 +2608,8 @@ def cancel_tasks_for_timeout(self, task_ids: set[JobName], reason: str) -> TxRes ) tasks_to_kill.update(final_kill) task_kill_workers.update(final_workers) - self._record_transaction( - cur, - "cancel_tasks_for_timeout", - [("task_timeout", tid.to_wire(), {"reason": reason}) for tid in tasks_to_kill], - ) + for tid in tasks_to_kill: + log_event("task_timeout", tid.to_wire(), reason=reason) return TxResult(tasks_to_kill=tasks_to_kill, task_kill_workers=task_kill_workers) def drain_dispatch(self, worker_id: WorkerId) -> DispatchBatch | None: @@ -2803,8 +2795,8 @@ def remove_finished_job(self, job_id: JobName) -> bool: ): return False cur.execute("DELETE FROM jobs WHERE job_id = ?", (job_id.to_wire(),)) - self._record_transaction(cur, "remove_finished_job", [("job_removed", job_id.to_wire(), {"state": state})]) - return True + log_event("job_removed", job_id.to_wire(), state=state) + return True def remove_worker(self, worker_id: WorkerId) -> WorkerDetailRow | None: with self._db.transaction() as cur: @@ -2812,7 +2804,7 @@ def remove_worker(self, worker_id: WorkerId) -> WorkerDetailRow | None: if row is None: return None _remove_worker(cur, str(worker_id)) - self._record_transaction(cur, "remove_worker", [("worker_removed", str(worker_id), {})]) + log_event("worker_removed", str(worker_id)) self._db.remove_worker_from_attr_cache(worker_id) return WORKER_DETAIL_PROJECTION.decode_one([row]) @@ -2965,7 +2957,6 @@ def prune_old_data( *, job_retention: Duration, worker_retention: Duration, - txn_action_retention: Duration, profile_retention: Duration, stop_event: threading.Event | None = None, pause_between_s: float = 1.0, @@ -2979,7 +2970,6 @@ def prune_old_data( Args: job_retention: Delete terminal jobs whose finished_at is older than this. worker_retention: Delete inactive/unhealthy workers whose last heartbeat is older than this. - txn_action_retention: Delete txn_actions older than this. profile_retention: Delete task_profiles older than this. stop_event: If set, abort early (e.g. during shutdown). pause_between_s: Sleep between individual deletes to reduce lock contention. @@ -2987,7 +2977,6 @@ def prune_old_data( now_ms = Timestamp.now().epoch_ms() job_cutoff_ms = now_ms - job_retention.to_ms() worker_cutoff_ms = now_ms - worker_retention.to_ms() - txn_cutoff_ms = now_ms - txn_action_retention.to_ms() terminal_states = tuple(TERMINAL_JOB_STATES) placeholders = ",".join("?" * len(terminal_states)) @@ -3012,7 +3001,7 @@ def _stopped() -> bool: # drops rows SQLite is about to delete for us. 
self._db.endpoints.remove_by_job_ids(cur, [JobName.from_wire(str(job_id))]) cur.execute("DELETE FROM jobs WHERE job_id = ?", (job_id,)) - self._record_transaction(cur, "prune_old_data", [("job_pruned", str(job_id), {})]) + log_event("job_pruned", str(job_id)) jobs_deleted += 1 time.sleep(pause_between_s) @@ -3029,20 +3018,11 @@ def _stopped() -> bool: worker_id = row["worker_id"] with self._db.transaction() as cur: _remove_worker(cur, str(worker_id)) - self._record_transaction(cur, "prune_old_data", [("worker_pruned", str(worker_id), {})]) + log_event("worker_pruned", str(worker_id)) workers_deleted += 1 time.sleep(pause_between_s) - # 3. txn_actions: batch of 1000 per transaction (no CASCADE) - txn_actions_deleted = self._batch_delete( - "DELETE FROM txn_actions WHERE rowid IN " - "(SELECT rowid FROM txn_actions WHERE created_at_ms < ? LIMIT 1000)", - (txn_cutoff_ms,), - _stopped, - pause_between_s, - ) - - # 4. Task profiles: batch of 1000 per transaction + # 3. Task profiles: batch of 1000 per transaction profile_cutoff_ms = now_ms - profile_retention.to_ms() # 4a. Delete stale profiles by age. profiles_deleted = self._batch_delete( @@ -3066,15 +3046,13 @@ def _stopped() -> bool: result = PruneResult( jobs_deleted=jobs_deleted, workers_deleted=workers_deleted, - txn_actions_deleted=txn_actions_deleted, profiles_deleted=profiles_deleted, ) if result.total > 0: logger.info( - "Pruned old data: %d jobs, %d workers, %d txn_actions, %d profiles", + "Pruned old data: %d jobs, %d workers, %d profiles", result.jobs_deleted, result.workers_deleted, - result.txn_actions_deleted, result.profiles_deleted, ) self._db.optimize() @@ -3696,10 +3674,9 @@ def apply_direct_provider_updates(self, updates: list[TaskUpdate]) -> TxResult: cascaded_jobs.add(task.job_id) if tasks_to_kill or cascaded_jobs: - actions: list[tuple[str, str, dict[str, object]]] = [("direct_provider_updates_applied", "direct", {})] + log_event("direct_provider_updates_applied", "direct") for job_id in cascaded_jobs: - actions.append(("job_terminated", job_id.to_wire(), {})) - self._record_transaction(cur, "apply_direct_provider_updates", actions) + log_event("job_terminated", job_id.to_wire(), trigger="direct_provider_updates_applied") return TxResult(tasks_to_kill=tasks_to_kill, task_kill_workers=task_kill_workers) diff --git a/lib/iris/src/iris/rpc/controller.proto b/lib/iris/src/iris/rpc/controller.proto index c0eaf14c83..95eca419cf 100644 --- a/lib/iris/src/iris/rpc/controller.proto +++ b/lib/iris/src/iris/rpc/controller.proto @@ -311,22 +311,6 @@ message Controller { iris.vm.AutoscalerStatus status = 1; } - // --- Transactions / Actions Log --- - message TransactionAction { - iris.time.Timestamp timestamp = 1; - string action = 2; - string entity_id = 3; - string details = 4; - } - - message GetTransactionsRequest { - int32 limit = 1; // 0 = default (50) - } - - message GetTransactionsResponse { - repeated TransactionAction actions = 1; - } - // --- Checkpoint --- message BeginCheckpointRequest {} message BeginCheckpointResponse { @@ -594,9 +578,6 @@ service ControllerService { // Autoscaler rpc GetAutoscalerStatus(Controller.GetAutoscalerStatusRequest) returns (Controller.GetAutoscalerStatusResponse); - // Transactions / Actions Log - rpc GetTransactions(Controller.GetTransactionsRequest) returns (Controller.GetTransactionsResponse); - rpc ListUsers(Controller.ListUsersRequest) returns (Controller.ListUsersResponse); // DEPRECATED: use LogService.FetchLogs with regex patterns instead. 
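With the GetTransactions RPC removed here, these audit events are only reachable through the log stream, for example via `iris process logs` as shown in the OPS.md hunk above. The sketch below is a rough post-processing aid, not part of this change, with an invented script name; it turns the flat `event=... entity=... trigger=...` lines back into dicts. Values containing spaces (such as error messages) are only partially recovered from the k=v layout.

```python
import re
import sys

# Leading event/entity/trigger triple, then optional k=v extras.
AUDIT_RE = re.compile(r"event=(?P<event>\S+)\s+entity=(?P<entity>\S+)\s+trigger=(?P<trigger>\S+)(?P<rest>.*)")


def parse_audit_line(line: str) -> dict[str, str] | None:
    """Best-effort parse of one controller audit line into a flat dict."""
    m = AUDIT_RE.search(line)
    if m is None:
        return None
    fields = {"event": m["event"], "entity": m["entity"], "trigger": m["trigger"]}
    for pair in m["rest"].split():
        key, sep, value = pair.partition("=")
        if sep and key not in fields:
            fields[key] = value
    return fields


if __name__ == "__main__":
    # e.g. iris process logs --since 24h | grep 'event=' | python parse_audit_lines.py
    for raw in sys.stdin:
        parsed = parse_audit_line(raw)
        if parsed is not None:
            print(parsed)
```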
diff --git a/lib/iris/src/iris/rpc/controller_connect.py b/lib/iris/src/iris/rpc/controller_connect.py index c5ec544e30..83606da5ec 100644 --- a/lib/iris/src/iris/rpc/controller_connect.py +++ b/lib/iris/src/iris/rpc/controller_connect.py @@ -58,9 +58,6 @@ async def list_endpoints(self, request: controller__pb2.Controller.ListEndpoints async def get_autoscaler_status(self, request: controller__pb2.Controller.GetAutoscalerStatusRequest, ctx: RequestContext) -> controller__pb2.Controller.GetAutoscalerStatusResponse: raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") - async def get_transactions(self, request: controller__pb2.Controller.GetTransactionsRequest, ctx: RequestContext) -> controller__pb2.Controller.GetTransactionsResponse: - raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") - async def list_users(self, request: controller__pb2.Controller.ListUsersRequest, ctx: RequestContext) -> controller__pb2.Controller.ListUsersResponse: raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") @@ -263,16 +260,6 @@ def __init__(self, service: ControllerService | AsyncGenerator[ControllerService ), function=svc.get_autoscaler_status, ), - "/iris.cluster.ControllerService/GetTransactions": Endpoint.unary( - method=MethodInfo( - name="GetTransactions", - service_name="iris.cluster.ControllerService", - input=controller__pb2.Controller.GetTransactionsRequest, - output=controller__pb2.Controller.GetTransactionsResponse, - idempotency_level=IdempotencyLevel.UNKNOWN, - ), - function=svc.get_transactions, - ), "/iris.cluster.ControllerService/ListUsers": Endpoint.unary( method=MethodInfo( name="ListUsers", @@ -766,26 +753,6 @@ async def get_autoscaler_status( timeout_ms=timeout_ms, ) - async def get_transactions( - self, - request: controller__pb2.Controller.GetTransactionsRequest, - *, - headers: Headers | Mapping[str, str] | None = None, - timeout_ms: int | None = None, - ) -> controller__pb2.Controller.GetTransactionsResponse: - return await self.execute_unary( - request=request, - method=MethodInfo( - name="GetTransactions", - service_name="iris.cluster.ControllerService", - input=controller__pb2.Controller.GetTransactionsRequest, - output=controller__pb2.Controller.GetTransactionsResponse, - idempotency_level=IdempotencyLevel.UNKNOWN, - ), - headers=headers, - timeout_ms=timeout_ms, - ) - async def list_users( self, request: controller__pb2.Controller.ListUsersRequest, @@ -1254,8 +1221,6 @@ def list_endpoints(self, request: controller__pb2.Controller.ListEndpointsReques raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") def get_autoscaler_status(self, request: controller__pb2.Controller.GetAutoscalerStatusRequest, ctx: RequestContext) -> controller__pb2.Controller.GetAutoscalerStatusResponse: raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") - def get_transactions(self, request: controller__pb2.Controller.GetTransactionsRequest, ctx: RequestContext) -> controller__pb2.Controller.GetTransactionsResponse: - raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") def list_users(self, request: controller__pb2.Controller.ListUsersRequest, ctx: RequestContext) -> controller__pb2.Controller.ListUsersResponse: raise ConnectError(Code.UNIMPLEMENTED, "Not implemented") def get_task_logs(self, request: controller__pb2.Controller.GetTaskLogsRequest, ctx: RequestContext) -> controller__pb2.Controller.GetTaskLogsResponse: @@ -1436,16 +1401,6 @@ def __init__(self, service: ControllerServiceSync, interceptors: Iterable[Interc ), function=service.get_autoscaler_status, ), - 
"/iris.cluster.ControllerService/GetTransactions": EndpointSync.unary( - method=MethodInfo( - name="GetTransactions", - service_name="iris.cluster.ControllerService", - input=controller__pb2.Controller.GetTransactionsRequest, - output=controller__pb2.Controller.GetTransactionsResponse, - idempotency_level=IdempotencyLevel.UNKNOWN, - ), - function=service.get_transactions, - ), "/iris.cluster.ControllerService/ListUsers": EndpointSync.unary( method=MethodInfo( name="ListUsers", @@ -1939,26 +1894,6 @@ def get_autoscaler_status( timeout_ms=timeout_ms, ) - def get_transactions( - self, - request: controller__pb2.Controller.GetTransactionsRequest, - *, - headers: Headers | Mapping[str, str] | None = None, - timeout_ms: int | None = None, - ) -> controller__pb2.Controller.GetTransactionsResponse: - return self.execute_unary( - request=request, - method=MethodInfo( - name="GetTransactions", - service_name="iris.cluster.ControllerService", - input=controller__pb2.Controller.GetTransactionsRequest, - output=controller__pb2.Controller.GetTransactionsResponse, - idempotency_level=IdempotencyLevel.UNKNOWN, - ), - headers=headers, - timeout_ms=timeout_ms, - ) - def list_users( self, request: controller__pb2.Controller.ListUsersRequest, diff --git a/lib/iris/src/iris/rpc/controller_pb2.py b/lib/iris/src/iris/rpc/controller_pb2.py index e8bfaf1e36..6afa6bf537 100644 --- a/lib/iris/src/iris/rpc/controller_pb2.py +++ b/lib/iris/src/iris/rpc/controller_pb2.py @@ -29,7 +29,7 @@ from . import vm_pb2 as vm__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10\x63ontroller.proto\x12\x0ciris.cluster\x1a\tjob.proto\x1a\rlogging.proto\x1a\x0bquery.proto\x1a\ntime.proto\x1a\x08vm.proto\"\xb8Z\n\nController\x1a\xc7\x08\n\x10LaunchJobRequest\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12;\n\nentrypoint\x18\x02 \x01(\x0b\x32\x1b.iris.job.RuntimeEntrypointR\nentrypoint\x12\x39\n\tresources\x18\x03 \x01(\x0b\x32\x1b.iris.job.ResourceSpecProtoR\tresources\x12=\n\x0b\x65nvironment\x18\x04 \x01(\x0b\x32\x1b.iris.job.EnvironmentConfigR\x0b\x65nvironment\x12\x1b\n\tbundle_id\x18\x05 \x01(\tR\x08\x62undleId\x12\x1f\n\x0b\x62undle_blob\x18\x06 \x01(\x0cR\nbundleBlob\x12\x42\n\x12scheduling_timeout\x18\x08 \x01(\x0b\x32\x13.iris.time.DurationR\x11schedulingTimeout\x12\x14\n\x05ports\x18\t \x03(\tR\x05ports\x12*\n\x11max_task_failures\x18\x0b \x01(\x05R\x0fmaxTaskFailures\x12.\n\x13max_retries_failure\x18\x0c \x01(\x05R\x11maxRetriesFailure\x12\x34\n\x16max_retries_preemption\x18\r \x01(\x05R\x14maxRetriesPreemption\x12\x36\n\x0b\x63onstraints\x18\x0e \x03(\x0b\x32\x14.iris.job.ConstraintR\x0b\x63onstraints\x12@\n\x0c\x63oscheduling\x18\x0f \x01(\x0b\x32\x1c.iris.job.CoschedulingConfigR\x0c\x63oscheduling\x12\x1a\n\x08replicas\x18\x14 \x01(\x05R\x08replicas\x12-\n\x07timeout\x18\x15 \x01(\x0b\x32\x13.iris.time.DurationR\x07timeout\x12$\n\x0e\x66\x61il_if_exists\x18\x16 \x01(\x08R\x0c\x66\x61ilIfExists\x12=\n\x0breservation\x18\x1e \x01(\x0b\x32\x1b.iris.job.ReservationConfigR\x0breservation\x12J\n\x11preemption_policy\x18\x1f \x01(\x0e\x32\x1d.iris.job.JobPreemptionPolicyR\x10preemptionPolicy\x12K\n\x13\x65xisting_job_policy\x18 \x01(\x0e\x32\x1b.iris.job.ExistingJobPolicyR\x11\x65xistingJobPolicy\x12;\n\rpriority_band\x18! 
\x01(\x0e\x32\x16.iris.job.PriorityBandR\x0cpriorityBand\x12\x1d\n\ntask_image\x18\" \x01(\tR\ttaskImage\x12\x1f\n\x0bsubmit_argv\x18# \x03(\tR\nsubmitArgv\x1a*\n\x11LaunchJobResponse\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a\x38\n\x13GetJobStatusRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobIdJ\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04\x1a\xfa\x01\n\x14GetJobStatusResponse\x12%\n\x03job\x18\x01 \x01(\x0b\x32\x13.iris.job.JobStatusR\x03job\x12\x43\n\x07request\x18\x02 \x01(\x0b\x32).iris.cluster.Controller.LaunchJobRequestR\x07request\x12:\n\x0cresource_min\x18\x03 \x01(\x0b\x32\x17.iris.job.ResourceUsageR\x0bresourceMin\x12:\n\x0cresource_max\x18\x04 \x01(\x0b\x32\x17.iris.job.ResourceUsageR\x0bresourceMax\x1a-\n\x12GetJobStateRequest\x12\x17\n\x07job_ids\x18\x01 \x03(\tR\x06jobIds\x1a\xb6\x01\n\x13GetJobStateResponse\x12P\n\x06states\x18\x01 \x03(\x0b\x32\x38.iris.cluster.Controller.GetJobStateResponse.StatesEntryR\x06states\x1aM\n\x0bStatesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12(\n\x05value\x18\x02 \x01(\x0e\x32\x12.iris.job.JobStateR\x05value:\x02\x38\x01\x1a,\n\x13TerminateJobRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a\xf3\x02\n\x08JobQuery\x12<\n\x05scope\x18\x01 \x01(\x0e\x32&.iris.cluster.Controller.JobQueryScopeR\x05scope\x12\"\n\rparent_job_id\x18\x02 \x01(\tR\x0bparentJobId\x12\x1f\n\x0bname_filter\x18\x03 \x01(\tR\nnameFilter\x12!\n\x0cstate_filter\x18\x04 \x01(\tR\x0bstateFilter\x12\x44\n\nsort_field\x18\x05 \x01(\x0e\x32%.iris.cluster.Controller.JobSortFieldR\tsortField\x12M\n\x0esort_direction\x18\x06 \x01(\x0e\x32&.iris.cluster.Controller.SortDirectionR\rsortDirection\x12\x16\n\x06offset\x18\x07 \x01(\x05R\x06offset\x12\x14\n\x05limit\x18\x08 \x01(\x05R\x05limit\x1a\xc9\x01\n\x0fListJobsRequest\x12\x37\n\x05query\x18\t \x01(\x0b\x32!.iris.cluster.Controller.JobQueryR\x05queryJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x06\x10\x07J\x04\x08\x07\x10\x08R\x06offsetR\x05limitR\nsort_fieldR\x0esort_directionR\x0bname_filterR\x0cstate_filterR\rparent_job_id\x1aw\n\x10ListJobsResponse\x12\'\n\x04jobs\x18\x01 \x03(\x0b\x32\x13.iris.job.JobStatusR\x04jobs\x12\x1f\n\x0btotal_count\x18\x02 \x01(\x05R\ntotalCount\x12\x19\n\x08has_more\x18\x03 \x01(\x08R\x07hasMore\x1a/\n\x14GetTaskStatusRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x1a\x83\x01\n\x15GetTaskStatusResponse\x12(\n\x04task\x18\x01 \x01(\x0b\x32\x14.iris.job.TaskStatusR\x04task\x12@\n\rjob_resources\x18\x02 \x01(\x0b\x32\x1b.iris.job.ResourceSpecProtoR\x0cjobResources\x1a)\n\x10ListTasksRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a?\n\x11ListTasksResponse\x12*\n\x05tasks\x18\x01 \x03(\x0b\x32\x14.iris.job.TaskStatusR\x05tasks\x1at\n\x16\x45xecInContainerRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x18\n\x07\x63ommand\x18\x02 \x03(\tR\x07\x63ommand\x12\'\n\x0ftimeout_seconds\x18\x03 \x01(\x05R\x0etimeoutSeconds\x1a|\n\x17\x45xecInContainerResponse\x12\x1b\n\texit_code\x18\x01 \x01(\x05R\x08\x65xitCode\x12\x16\n\x06stdout\x18\x02 \x01(\tR\x06stdout\x12\x16\n\x06stderr\x18\x03 \x01(\tR\x06stderr\x12\x14\n\x05\x65rror\x18\x04 \x01(\tR\x05\x65rror\x1a\xb4\x01\n\nWorkerInfo\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x18\n\x07\x61\x64\x64ress\x18\x02 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x03 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12\x39\n\rregistered_at\x18\x04 
\x01(\x0b\x32\x14.iris.time.TimestampR\x0cregisteredAt\x1a\xda\x02\n\x12WorkerHealthStatus\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x18\n\x07healthy\x18\x02 \x01(\x08R\x07healthy\x12\x31\n\x14\x63onsecutive_failures\x18\x03 \x01(\x05R\x13\x63onsecutiveFailures\x12;\n\x0elast_heartbeat\x18\x04 \x01(\x0b\x32\x14.iris.time.TimestampR\rlastHeartbeat\x12&\n\x0frunning_job_ids\x18\x05 \x03(\tR\rrunningJobIds\x12\x18\n\x07\x61\x64\x64ress\x18\x06 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x07 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12%\n\x0estatus_message\x18\x08 \x01(\tR\rstatusMessage\x1a\x14\n\x12ListWorkersRequest\x1a\\\n\x13ListWorkersResponse\x12\x45\n\x07workers\x18\x01 \x03(\x0b\x32+.iris.cluster.Controller.WorkerHealthStatusR\x07workers\x1a\xba\x01\n\x0fRegisterRequest\x12\x18\n\x07\x61\x64\x64ress\x18\x01 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x02 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12\x1b\n\tworker_id\x18\x03 \x01(\tR\x08workerId\x12\x19\n\x08slice_id\x18\x04 \x01(\tR\x07sliceId\x12\x1f\n\x0bscale_group\x18\x05 \x01(\tR\nscaleGroup\x1aK\n\x10RegisterResponse\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x1a\n\x08\x61\x63\x63\x65pted\x18\x02 \x01(\x08R\x08\x61\x63\x63\x65pted\x1a\xfc\x01\n\x08\x45ndpoint\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x18\n\x07\x61\x64\x64ress\x18\x03 \x01(\tR\x07\x61\x64\x64ress\x12\x17\n\x07task_id\x18\x04 \x01(\tR\x06taskId\x12K\n\x08metadata\x18\x05 \x03(\x0b\x32/.iris.cluster.Controller.Endpoint.MetadataEntryR\x08metadata\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\xb9\x02\n\x17RegisterEndpointRequest\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x18\n\x07\x61\x64\x64ress\x18\x02 \x01(\tR\x07\x61\x64\x64ress\x12\x17\n\x07task_id\x18\x03 \x01(\tR\x06taskId\x12Z\n\x08metadata\x18\x04 \x03(\x0b\x32>.iris.cluster.Controller.RegisterEndpointRequest.MetadataEntryR\x08metadata\x12\x1d\n\nattempt_id\x18\x05 \x01(\x05R\tattemptId\x12\x1f\n\x0b\x65ndpoint_id\x18\x06 \x01(\tR\nendpointId\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a;\n\x18RegisterEndpointResponse\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x1a<\n\x19UnregisterEndpointRequest\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x1a\x44\n\x14ListEndpointsRequest\x12\x16\n\x06prefix\x18\x01 \x01(\tR\x06prefix\x12\x14\n\x05\x65xact\x18\x02 \x01(\x08R\x05\x65xact\x1aX\n\x15ListEndpointsResponse\x12?\n\tendpoints\x18\x01 \x03(\x0b\x32!.iris.cluster.Controller.EndpointR\tendpoints\x1a\x1c\n\x1aGetAutoscalerStatusRequest\x1aP\n\x1bGetAutoscalerStatusResponse\x12\x31\n\x06status\x18\x01 \x01(\x0b\x32\x19.iris.vm.AutoscalerStatusR\x06status\x1a\x96\x01\n\x11TransactionAction\x12\x32\n\ttimestamp\x18\x01 \x01(\x0b\x32\x14.iris.time.TimestampR\ttimestamp\x12\x16\n\x06\x61\x63tion\x18\x02 \x01(\tR\x06\x61\x63tion\x12\x1b\n\tentity_id\x18\x03 \x01(\tR\x08\x65ntityId\x12\x18\n\x07\x64\x65tails\x18\x04 \x01(\tR\x07\x64\x65tails\x1a.\n\x16GetTransactionsRequest\x12\x14\n\x05limit\x18\x01 \x01(\x05R\x05limit\x1a_\n\x17GetTransactionsResponse\x12\x44\n\x07\x61\x63tions\x18\x01 \x03(\x0b\x32*.iris.cluster.Controller.TransactionActionR\x07\x61\x63tions\x1a\x18\n\x16\x42\x65ginCheckpointRequest\x1a\xd6\x01\n\x17\x42\x65ginCheckpointResponse\x12\'\n\x0f\x63heckpoint_path\x18\x01 
\x01(\tR\x0e\x63heckpointPath\x12\x33\n\ncreated_at\x18\x02 \x01(\x0b\x32\x14.iris.time.TimestampR\tcreatedAt\x12\x1b\n\tjob_count\x18\x03 \x01(\x05R\x08jobCount\x12\x1d\n\ntask_count\x18\x04 \x01(\x05R\ttaskCount\x12!\n\x0cworker_count\x18\x05 \x01(\x05R\x0bworkerCount\x1a\xf3\x02\n\x0bUserSummary\x12\x12\n\x04user\x18\x01 \x01(\tR\x04user\x12\x65\n\x11task_state_counts\x18\x02 \x03(\x0b\x32\x39.iris.cluster.Controller.UserSummary.TaskStateCountsEntryR\x0ftaskStateCounts\x12\x62\n\x10job_state_counts\x18\x03 \x03(\x0b\x32\x38.iris.cluster.Controller.UserSummary.JobStateCountsEntryR\x0ejobStateCounts\x1a\x42\n\x14TaskStateCountsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\x05R\x05value:\x02\x38\x01\x1a\x41\n\x13JobStateCountsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\x05R\x05value:\x02\x38\x01\x1a\x12\n\x10ListUsersRequest\x1aO\n\x11ListUsersResponse\x12:\n\x05users\x18\x01 \x03(\x0b\x32$.iris.cluster.Controller.UserSummaryR\x05users\x1a\x98\x02\n\x12GetTaskLogsRequest\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12)\n\x10include_children\x18\x02 \x01(\x08R\x0fincludeChildren\x12\x19\n\x08since_ms\x18\x03 \x01(\x03R\x07sinceMs\x12&\n\x0fmax_total_lines\x18\x04 \x01(\x03R\rmaxTotalLines\x12\x1c\n\tsubstring\x18\x05 \x01(\tR\tsubstring\x12\x1d\n\nattempt_id\x18\x06 \x01(\x05R\tattemptId\x12\x1b\n\tmin_level\x18\x07 \x01(\tR\x08minLevel\x12\x16\n\x06\x63ursor\x18\x08 \x01(\x03R\x06\x63ursor\x12\x12\n\x04tail\x18\t \x01(\x08R\x04tail\x1a\x86\x01\n\x0cTaskLogBatch\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12*\n\x04logs\x18\x02 \x03(\x0b\x32\x16.iris.logging.LogEntryR\x04logs\x12\x14\n\x05\x65rror\x18\x03 \x01(\tR\x05\x65rror\x12\x1b\n\tworker_id\x18\x04 \x01(\tR\x08workerId\x1a\xd2\x01\n\x13GetTaskLogsResponse\x12\x42\n\ttask_logs\x18\x01 \x03(\x0b\x32%.iris.cluster.Controller.TaskLogBatchR\x08taskLogs\x12\x1c\n\ttruncated\x18\x02 \x01(\x08R\ttruncated\x12\x41\n\x12\x63hild_job_statuses\x18\x03 \x03(\x0b\x32\x13.iris.job.JobStatusR\x10\x63hildJobStatuses\x12\x16\n\x06\x63ursor\x18\x04 \x01(\x03R\x06\x63ursor\x1a(\n\x16GetWorkerStatusRequest\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x1a\xe8\x03\n\x17GetWorkerStatusResponse\x12\x1f\n\x02vm\x18\x01 \x01(\x0b\x32\x0f.iris.vm.VmInfoR\x02vm\x12\x1f\n\x0bscale_group\x18\x02 \x01(\tR\nscaleGroup\x12\x43\n\x06worker\x18\x03 \x01(\x0b\x32+.iris.cluster.Controller.WorkerHealthStatusR\x06worker\x12%\n\x0e\x62ootstrap_logs\x18\x04 \x01(\tR\rbootstrapLogs\x12\x44\n\x12worker_log_entries\x18\t \x03(\x0b\x32\x16.iris.logging.LogEntryR\x10workerLogEntries\x12\x37\n\x0crecent_tasks\x18\x06 \x03(\x0b\x32\x14.iris.job.TaskStatusR\x0brecentTasks\x12M\n\x11\x63urrent_resources\x18\x07 \x01(\x0b\x32 .iris.job.WorkerResourceSnapshotR\x10\x63urrentResources\x12K\n\x10resource_history\x18\x08 \x03(\x0b\x32 .iris.job.WorkerResourceSnapshotR\x0fresourceHistoryJ\x04\x08\x05\x10\x06\x1a\xce\x01\n\x0fSchedulingEvent\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x1d\n\nattempt_id\x18\x02 \x01(\x05R\tattemptId\x12\x1d\n\nevent_type\x18\x03 \x01(\tR\teventType\x12\x16\n\x06reason\x18\x04 \x01(\tR\x06reason\x12\x18\n\x07message\x18\x05 \x01(\tR\x07message\x12\x32\n\ttimestamp\x18\x06 \x01(\x0b\x32\x14.iris.time.TimestampR\ttimestamp\x1a\x8e\x02\n\x0f\x43lusterCapacity\x12+\n\x11schedulable_nodes\x18\x01 \x01(\x05R\x10schedulableNodes\x12\x30\n\x14total_cpu_millicores\x18\x02 \x01(\x03R\x12totalCpuMillicores\x12\x38\n\x18\x61vailable_cpu_millicores\x18\x03 
\x01(\x03R\x16\x61vailableCpuMillicores\x12,\n\x12total_memory_bytes\x18\x04 \x01(\x03R\x10totalMemoryBytes\x12\x34\n\x16\x61vailable_memory_bytes\x18\x05 \x01(\x03R\x14\x61vailableMemoryBytes\x1a\x1a\n\x18GetProviderStatusRequest\x1a\xe8\x01\n\x19GetProviderStatusResponse\x12.\n\x13has_direct_provider\x18\x01 \x01(\x08R\x11hasDirectProvider\x12U\n\x11scheduling_events\x18\x02 \x03(\x0b\x32(.iris.cluster.Controller.SchedulingEventR\x10schedulingEvents\x12\x44\n\x08\x63\x61pacity\x18\x03 \x01(\x0b\x32(.iris.cluster.Controller.ClusterCapacityR\x08\x63\x61pacity\x1a#\n!GetKubernetesClusterStatusRequest\x1a\xed\x01\n\x13KubernetesPodStatus\x12\x19\n\x08pod_name\x18\x01 \x01(\tR\x07podName\x12\x17\n\x07task_id\x18\x02 \x01(\tR\x06taskId\x12\x14\n\x05phase\x18\x03 \x01(\tR\x05phase\x12\x16\n\x06reason\x18\x04 \x01(\tR\x06reason\x12\x18\n\x07message\x18\x05 \x01(\tR\x07message\x12=\n\x0flast_transition\x18\x06 \x01(\x0b\x32\x14.iris.time.TimestampR\x0elastTransition\x12\x1b\n\tnode_name\x18\x07 \x01(\tR\x08nodeName\x1a\x8f\x03\n\x0eNodePoolStatus\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12#\n\rinstance_type\x18\x02 \x01(\tR\x0cinstanceType\x12\x1f\n\x0bscale_group\x18\x03 \x01(\tR\nscaleGroup\x12!\n\x0ctarget_nodes\x18\x04 \x01(\x05R\x0btargetNodes\x12#\n\rcurrent_nodes\x18\x05 \x01(\x05R\x0c\x63urrentNodes\x12!\n\x0cqueued_nodes\x18\x06 \x01(\x05R\x0bqueuedNodes\x12*\n\x11in_progress_nodes\x18\x07 \x01(\x05R\x0finProgressNodes\x12 \n\x0b\x61utoscaling\x18\x08 \x01(\x08R\x0b\x61utoscaling\x12\x1b\n\tmin_nodes\x18\t \x01(\x05R\x08minNodes\x12\x1b\n\tmax_nodes\x18\n \x01(\x05R\x08maxNodes\x12\x1a\n\x08\x63\x61pacity\x18\x0b \x01(\tR\x08\x63\x61pacity\x12\x14\n\x05quota\x18\x0c \x01(\tR\x05quota\x1a\xac\x03\n\"GetKubernetesClusterStatusResponse\x12\x1c\n\tnamespace\x18\x01 \x01(\tR\tnamespace\x12\x1f\n\x0btotal_nodes\x18\x02 \x01(\x05R\ntotalNodes\x12+\n\x11schedulable_nodes\x18\x03 \x01(\x05R\x10schedulableNodes\x12\'\n\x0f\x61llocatable_cpu\x18\x04 \x01(\tR\x0e\x61llocatableCpu\x12-\n\x12\x61llocatable_memory\x18\x05 \x01(\tR\x11\x61llocatableMemory\x12O\n\x0cpod_statuses\x18\x06 \x03(\x0b\x32,.iris.cluster.Controller.KubernetesPodStatusR\x0bpodStatuses\x12)\n\x10provider_version\x18\x07 \x01(\tR\x0fproviderVersion\x12\x46\n\nnode_pools\x18\x08 \x03(\x0b\x32\'.iris.cluster.Controller.NodePoolStatusR\tnodePools\x1a\x33\n\x14RestartWorkerRequest\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x1aI\n\x15RestartWorkerResponse\x12\x1a\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08R\x08\x61\x63\x63\x65pted\x12\x14\n\x05\x65rror\x18\x02 \x01(\tR\x05\x65rror\x1a\x85\x01\n\x14SetUserBudgetRequest\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 \x01(\x03R\x0b\x62udgetLimit\x12\x31\n\x08max_band\x18\x03 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x1a\x17\n\x15SetUserBudgetResponse\x1a/\n\x14GetUserBudgetRequest\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x1a\xa9\x01\n\x15GetUserBudgetResponse\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 \x01(\x03R\x0b\x62udgetLimit\x12!\n\x0c\x62udget_spent\x18\x03 \x01(\x03R\x0b\x62udgetSpent\x12\x31\n\x08max_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x1a\x18\n\x16ListUserBudgetsRequest\x1a_\n\x17ListUserBudgetsResponse\x12\x44\n\x05users\x18\x01 \x03(\x0b\x32..iris.cluster.Controller.GetUserBudgetResponseR\x05users\x1al\n\x17UpdateTaskStatusRequest\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x34\n\x07updates\x18\x02 
\x03(\x0b\x32\x1a.iris.job.WorkerTaskStatusR\x07updates\x1a\x1a\n\x18UpdateTaskStatusResponse\x1a\x1a\n\x18GetSchedulerStateRequest\x1a\xa7\x02\n\x12SchedulerTaskEntry\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x15\n\x06job_id\x18\x02 \x01(\tR\x05jobId\x12\x17\n\x07user_id\x18\x03 \x01(\tR\x06userId\x12;\n\roriginal_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x0coriginalBand\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12%\n\x0equeue_position\x18\x06 \x01(\x05R\rqueuePosition\x12%\n\x0eresource_value\x18\x07 \x01(\x05R\rresourceValue\x1a\xa7\x01\n\x12SchedulerBandGroup\x12*\n\x04\x62\x61nd\x18\x01 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x04\x62\x61nd\x12\x41\n\x05tasks\x18\x02 \x03(\x0b\x32+.iris.cluster.Controller.SchedulerTaskEntryR\x05tasks\x12\"\n\rtotal_in_band\x18\x03 \x01(\x05R\x0btotalInBand\x1a\x97\x02\n\x13SchedulerUserBudget\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 \x01(\x03R\x0b\x62udgetLimit\x12!\n\x0c\x62udget_spent\x18\x03 \x01(\x03R\x0b\x62udgetSpent\x12\x31\n\x08max_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12/\n\x13utilization_percent\x18\x06 \x01(\x02R\x12utilizationPercent\x1a\xea\x02\n\x14SchedulerRunningTask\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x15\n\x06job_id\x18\x02 \x01(\tR\x05jobId\x12\x17\n\x07user_id\x18\x03 \x01(\tR\x06userId\x12\x1b\n\tworker_id\x18\x04 \x01(\tR\x08workerId\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12%\n\x0eresource_value\x18\x06 \x01(\x05R\rresourceValue\x12 \n\x0bpreemptible\x18\x07 \x01(\x08R\x0bpreemptible\x12=\n\x0epreemptible_by\x18\x08 \x03(\x0e\x32\x16.iris.job.PriorityBandR\rpreemptibleBy\x12%\n\x0eis_coscheduled\x18\t \x01(\x08R\risCoscheduled\x1a\xdc\x02\n\x19GetSchedulerStateResponse\x12P\n\rpending_queue\x18\x01 \x03(\x0b\x32+.iris.cluster.Controller.SchedulerBandGroupR\x0cpendingQueue\x12O\n\x0cuser_budgets\x18\x02 \x03(\x0b\x32,.iris.cluster.Controller.SchedulerUserBudgetR\x0buserBudgets\x12R\n\rrunning_tasks\x18\x03 \x03(\x0b\x32-.iris.cluster.Controller.SchedulerRunningTaskR\x0crunningTasks\x12#\n\rtotal_pending\x18\x04 \x01(\x05R\x0ctotalPending\x12#\n\rtotal_running\x18\x05 
\x01(\x05R\x0ctotalRunning\"\xb7\x01\n\x0cJobSortField\x12\x1e\n\x1aJOB_SORT_FIELD_UNSPECIFIED\x10\x00\x12\x17\n\x13JOB_SORT_FIELD_DATE\x10\x01\x12\x17\n\x13JOB_SORT_FIELD_NAME\x10\x02\x12\x18\n\x14JOB_SORT_FIELD_STATE\x10\x03\x12\x1b\n\x17JOB_SORT_FIELD_FAILURES\x10\x04\x12\x1e\n\x1aJOB_SORT_FIELD_PREEMPTIONS\x10\x05\"`\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x16\n\x12SORT_DIRECTION_ASC\x10\x01\x12\x17\n\x13SORT_DIRECTION_DESC\x10\x02\"\x82\x01\n\rJobQueryScope\x12\x1f\n\x1bJOB_QUERY_SCOPE_UNSPECIFIED\x10\x00\x12\x17\n\x13JOB_QUERY_SCOPE_ALL\x10\x01\x12\x19\n\x15JOB_QUERY_SCOPE_ROOTS\x10\x02\x12\x1c\n\x18JOB_QUERY_SCOPE_CHILDREN\x10\x03\x32\x84\x1d\n\x11\x43ontrollerService\x12\x62\n\tLaunchJob\x12).iris.cluster.Controller.LaunchJobRequest\x1a*.iris.cluster.Controller.LaunchJobResponse\x12k\n\x0cGetJobStatus\x12,.iris.cluster.Controller.GetJobStatusRequest\x1a-.iris.cluster.Controller.GetJobStatusResponse\x12h\n\x0bGetJobState\x12+.iris.cluster.Controller.GetJobStateRequest\x1a,.iris.cluster.Controller.GetJobStateResponse\x12M\n\x0cTerminateJob\x12,.iris.cluster.Controller.TerminateJobRequest\x1a\x0f.iris.job.Empty\x12_\n\x08ListJobs\x12(.iris.cluster.Controller.ListJobsRequest\x1a).iris.cluster.Controller.ListJobsResponse\x12n\n\rGetTaskStatus\x12-.iris.cluster.Controller.GetTaskStatusRequest\x1a..iris.cluster.Controller.GetTaskStatusResponse\x12\x62\n\tListTasks\x12).iris.cluster.Controller.ListTasksRequest\x1a*.iris.cluster.Controller.ListTasksResponse\x12_\n\x08Register\x12(.iris.cluster.Controller.RegisterRequest\x1a).iris.cluster.Controller.RegisterResponse\x12h\n\x0bListWorkers\x12+.iris.cluster.Controller.ListWorkersRequest\x1a,.iris.cluster.Controller.ListWorkersResponse\x12w\n\x10RegisterEndpoint\x12\x30.iris.cluster.Controller.RegisterEndpointRequest\x1a\x31.iris.cluster.Controller.RegisterEndpointResponse\x12Y\n\x12UnregisterEndpoint\x12\x32.iris.cluster.Controller.UnregisterEndpointRequest\x1a\x0f.iris.job.Empty\x12n\n\rListEndpoints\x12-.iris.cluster.Controller.ListEndpointsRequest\x1a..iris.cluster.Controller.ListEndpointsResponse\x12\x80\x01\n\x13GetAutoscalerStatus\x12\x33.iris.cluster.Controller.GetAutoscalerStatusRequest\x1a\x34.iris.cluster.Controller.GetAutoscalerStatusResponse\x12t\n\x0fGetTransactions\x12/.iris.cluster.Controller.GetTransactionsRequest\x1a\x30.iris.cluster.Controller.GetTransactionsResponse\x12\x62\n\tListUsers\x12).iris.cluster.Controller.ListUsersRequest\x1a*.iris.cluster.Controller.ListUsersResponse\x12h\n\x0bGetTaskLogs\x12+.iris.cluster.Controller.GetTaskLogsRequest\x1a,.iris.cluster.Controller.GetTaskLogsResponse\x12J\n\x0bProfileTask\x12\x1c.iris.job.ProfileTaskRequest\x1a\x1d.iris.job.ProfileTaskResponse\x12t\n\x0f\x45xecInContainer\x12/.iris.cluster.Controller.ExecInContainerRequest\x1a\x30.iris.cluster.Controller.ExecInContainerResponse\x12t\n\x0fGetWorkerStatus\x12/.iris.cluster.Controller.GetWorkerStatusRequest\x1a\x30.iris.cluster.Controller.GetWorkerStatusResponse\x12t\n\x0f\x42\x65ginCheckpoint\x12/.iris.cluster.Controller.BeginCheckpointRequest\x1a\x30.iris.cluster.Controller.BeginCheckpointResponse\x12Y\n\x10GetProcessStatus\x12!.iris.job.GetProcessStatusRequest\x1a\".iris.job.GetProcessStatusResponse\x12J\n\x0bGetAuthInfo\x12\x1c.iris.job.GetAuthInfoRequest\x1a\x1d.iris.job.GetAuthInfoResponse\x12\x38\n\x05Login\x12\x16.iris.job.LoginRequest\x1a\x17.iris.job.LoginResponse\x12M\n\x0c\x43reateApiKey\x12\x1d.iris.job.CreateApiKeyRequest\x1a\x1e.iris.job.CreateApiKeyResponse\x12>\n\x0cRevokeApiKey\x12\x1d.ir
is.job.RevokeApiKeyRequest\x1a\x0f.iris.job.Empty\x12J\n\x0bListApiKeys\x12\x1c.iris.job.ListApiKeysRequest\x1a\x1d.iris.job.ListApiKeysResponse\x12S\n\x0eGetCurrentUser\x12\x1f.iris.job.GetCurrentUserRequest\x1a .iris.job.GetCurrentUserResponse\x12z\n\x11GetProviderStatus\x12\x31.iris.cluster.Controller.GetProviderStatusRequest\x1a\x32.iris.cluster.Controller.GetProviderStatusResponse\x12\x95\x01\n\x1aGetKubernetesClusterStatus\x12:.iris.cluster.Controller.GetKubernetesClusterStatusRequest\x1a;.iris.cluster.Controller.GetKubernetesClusterStatusResponse\x12L\n\x0f\x45xecuteRawQuery\x12\x1b.iris.query.RawQueryRequest\x1a\x1c.iris.query.RawQueryResponse\x12n\n\rRestartWorker\x12-.iris.cluster.Controller.RestartWorkerRequest\x1a..iris.cluster.Controller.RestartWorkerResponse\x12n\n\rSetUserBudget\x12-.iris.cluster.Controller.SetUserBudgetRequest\x1a..iris.cluster.Controller.SetUserBudgetResponse\x12n\n\rGetUserBudget\x12-.iris.cluster.Controller.GetUserBudgetRequest\x1a..iris.cluster.Controller.GetUserBudgetResponse\x12t\n\x0fListUserBudgets\x12/.iris.cluster.Controller.ListUserBudgetsRequest\x1a\x30.iris.cluster.Controller.ListUserBudgetsResponse\x12z\n\x11GetSchedulerState\x12\x31.iris.cluster.Controller.GetSchedulerStateRequest\x1a\x32.iris.cluster.Controller.GetSchedulerStateResponse\x12w\n\x10UpdateTaskStatus\x12\x30.iris.cluster.Controller.UpdateTaskStatusRequest\x1a\x31.iris.cluster.Controller.UpdateTaskStatusResponseBt\n\x10\x63om.iris.clusterB\x0f\x43ontrollerProtoP\x01\xa2\x02\x03ICX\xaa\x02\x0cIris.Cluster\xca\x02\x0cIris\\Cluster\xe2\x02\x18Iris\\Cluster\\GPBMetadata\xea\x02\rIris::Clusterb\x08\x65\x64itionsp\xe8\x07') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10\x63ontroller.proto\x12\x0ciris.cluster\x1a\tjob.proto\x1a\rlogging.proto\x1a\x0bquery.proto\x1a\ntime.proto\x1a\x08vm.proto\"\x8eX\n\nController\x1a\xc7\x08\n\x10LaunchJobRequest\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12;\n\nentrypoint\x18\x02 \x01(\x0b\x32\x1b.iris.job.RuntimeEntrypointR\nentrypoint\x12\x39\n\tresources\x18\x03 \x01(\x0b\x32\x1b.iris.job.ResourceSpecProtoR\tresources\x12=\n\x0b\x65nvironment\x18\x04 \x01(\x0b\x32\x1b.iris.job.EnvironmentConfigR\x0b\x65nvironment\x12\x1b\n\tbundle_id\x18\x05 \x01(\tR\x08\x62undleId\x12\x1f\n\x0b\x62undle_blob\x18\x06 \x01(\x0cR\nbundleBlob\x12\x42\n\x12scheduling_timeout\x18\x08 \x01(\x0b\x32\x13.iris.time.DurationR\x11schedulingTimeout\x12\x14\n\x05ports\x18\t \x03(\tR\x05ports\x12*\n\x11max_task_failures\x18\x0b \x01(\x05R\x0fmaxTaskFailures\x12.\n\x13max_retries_failure\x18\x0c \x01(\x05R\x11maxRetriesFailure\x12\x34\n\x16max_retries_preemption\x18\r \x01(\x05R\x14maxRetriesPreemption\x12\x36\n\x0b\x63onstraints\x18\x0e \x03(\x0b\x32\x14.iris.job.ConstraintR\x0b\x63onstraints\x12@\n\x0c\x63oscheduling\x18\x0f \x01(\x0b\x32\x1c.iris.job.CoschedulingConfigR\x0c\x63oscheduling\x12\x1a\n\x08replicas\x18\x14 \x01(\x05R\x08replicas\x12-\n\x07timeout\x18\x15 \x01(\x0b\x32\x13.iris.time.DurationR\x07timeout\x12$\n\x0e\x66\x61il_if_exists\x18\x16 \x01(\x08R\x0c\x66\x61ilIfExists\x12=\n\x0breservation\x18\x1e \x01(\x0b\x32\x1b.iris.job.ReservationConfigR\x0breservation\x12J\n\x11preemption_policy\x18\x1f \x01(\x0e\x32\x1d.iris.job.JobPreemptionPolicyR\x10preemptionPolicy\x12K\n\x13\x65xisting_job_policy\x18 \x01(\x0e\x32\x1b.iris.job.ExistingJobPolicyR\x11\x65xistingJobPolicy\x12;\n\rpriority_band\x18! 
\x01(\x0e\x32\x16.iris.job.PriorityBandR\x0cpriorityBand\x12\x1d\n\ntask_image\x18\" \x01(\tR\ttaskImage\x12\x1f\n\x0bsubmit_argv\x18# \x03(\tR\nsubmitArgv\x1a*\n\x11LaunchJobResponse\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a\x38\n\x13GetJobStatusRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobIdJ\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04\x1a\xfa\x01\n\x14GetJobStatusResponse\x12%\n\x03job\x18\x01 \x01(\x0b\x32\x13.iris.job.JobStatusR\x03job\x12\x43\n\x07request\x18\x02 \x01(\x0b\x32).iris.cluster.Controller.LaunchJobRequestR\x07request\x12:\n\x0cresource_min\x18\x03 \x01(\x0b\x32\x17.iris.job.ResourceUsageR\x0bresourceMin\x12:\n\x0cresource_max\x18\x04 \x01(\x0b\x32\x17.iris.job.ResourceUsageR\x0bresourceMax\x1a-\n\x12GetJobStateRequest\x12\x17\n\x07job_ids\x18\x01 \x03(\tR\x06jobIds\x1a\xb6\x01\n\x13GetJobStateResponse\x12P\n\x06states\x18\x01 \x03(\x0b\x32\x38.iris.cluster.Controller.GetJobStateResponse.StatesEntryR\x06states\x1aM\n\x0bStatesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12(\n\x05value\x18\x02 \x01(\x0e\x32\x12.iris.job.JobStateR\x05value:\x02\x38\x01\x1a,\n\x13TerminateJobRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a\xf3\x02\n\x08JobQuery\x12<\n\x05scope\x18\x01 \x01(\x0e\x32&.iris.cluster.Controller.JobQueryScopeR\x05scope\x12\"\n\rparent_job_id\x18\x02 \x01(\tR\x0bparentJobId\x12\x1f\n\x0bname_filter\x18\x03 \x01(\tR\nnameFilter\x12!\n\x0cstate_filter\x18\x04 \x01(\tR\x0bstateFilter\x12\x44\n\nsort_field\x18\x05 \x01(\x0e\x32%.iris.cluster.Controller.JobSortFieldR\tsortField\x12M\n\x0esort_direction\x18\x06 \x01(\x0e\x32&.iris.cluster.Controller.SortDirectionR\rsortDirection\x12\x16\n\x06offset\x18\x07 \x01(\x05R\x06offset\x12\x14\n\x05limit\x18\x08 \x01(\x05R\x05limit\x1a\xc9\x01\n\x0fListJobsRequest\x12\x37\n\x05query\x18\t \x01(\x0b\x32!.iris.cluster.Controller.JobQueryR\x05queryJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x06\x10\x07J\x04\x08\x07\x10\x08R\x06offsetR\x05limitR\nsort_fieldR\x0esort_directionR\x0bname_filterR\x0cstate_filterR\rparent_job_id\x1aw\n\x10ListJobsResponse\x12\'\n\x04jobs\x18\x01 \x03(\x0b\x32\x13.iris.job.JobStatusR\x04jobs\x12\x1f\n\x0btotal_count\x18\x02 \x01(\x05R\ntotalCount\x12\x19\n\x08has_more\x18\x03 \x01(\x08R\x07hasMore\x1a/\n\x14GetTaskStatusRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x1a\x83\x01\n\x15GetTaskStatusResponse\x12(\n\x04task\x18\x01 \x01(\x0b\x32\x14.iris.job.TaskStatusR\x04task\x12@\n\rjob_resources\x18\x02 \x01(\x0b\x32\x1b.iris.job.ResourceSpecProtoR\x0cjobResources\x1a)\n\x10ListTasksRequest\x12\x15\n\x06job_id\x18\x01 \x01(\tR\x05jobId\x1a?\n\x11ListTasksResponse\x12*\n\x05tasks\x18\x01 \x03(\x0b\x32\x14.iris.job.TaskStatusR\x05tasks\x1at\n\x16\x45xecInContainerRequest\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x18\n\x07\x63ommand\x18\x02 \x03(\tR\x07\x63ommand\x12\'\n\x0ftimeout_seconds\x18\x03 \x01(\x05R\x0etimeoutSeconds\x1a|\n\x17\x45xecInContainerResponse\x12\x1b\n\texit_code\x18\x01 \x01(\x05R\x08\x65xitCode\x12\x16\n\x06stdout\x18\x02 \x01(\tR\x06stdout\x12\x16\n\x06stderr\x18\x03 \x01(\tR\x06stderr\x12\x14\n\x05\x65rror\x18\x04 \x01(\tR\x05\x65rror\x1a\xb4\x01\n\nWorkerInfo\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x18\n\x07\x61\x64\x64ress\x18\x02 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x03 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12\x39\n\rregistered_at\x18\x04 
\x01(\x0b\x32\x14.iris.time.TimestampR\x0cregisteredAt\x1a\xda\x02\n\x12WorkerHealthStatus\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x18\n\x07healthy\x18\x02 \x01(\x08R\x07healthy\x12\x31\n\x14\x63onsecutive_failures\x18\x03 \x01(\x05R\x13\x63onsecutiveFailures\x12;\n\x0elast_heartbeat\x18\x04 \x01(\x0b\x32\x14.iris.time.TimestampR\rlastHeartbeat\x12&\n\x0frunning_job_ids\x18\x05 \x03(\tR\rrunningJobIds\x12\x18\n\x07\x61\x64\x64ress\x18\x06 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x07 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12%\n\x0estatus_message\x18\x08 \x01(\tR\rstatusMessage\x1a\x14\n\x12ListWorkersRequest\x1a\\\n\x13ListWorkersResponse\x12\x45\n\x07workers\x18\x01 \x03(\x0b\x32+.iris.cluster.Controller.WorkerHealthStatusR\x07workers\x1a\xba\x01\n\x0fRegisterRequest\x12\x18\n\x07\x61\x64\x64ress\x18\x01 \x01(\tR\x07\x61\x64\x64ress\x12\x34\n\x08metadata\x18\x02 \x01(\x0b\x32\x18.iris.job.WorkerMetadataR\x08metadata\x12\x1b\n\tworker_id\x18\x03 \x01(\tR\x08workerId\x12\x19\n\x08slice_id\x18\x04 \x01(\tR\x07sliceId\x12\x1f\n\x0bscale_group\x18\x05 \x01(\tR\nscaleGroup\x1aK\n\x10RegisterResponse\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x1a\n\x08\x61\x63\x63\x65pted\x18\x02 \x01(\x08R\x08\x61\x63\x63\x65pted\x1a\xfc\x01\n\x08\x45ndpoint\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x18\n\x07\x61\x64\x64ress\x18\x03 \x01(\tR\x07\x61\x64\x64ress\x12\x17\n\x07task_id\x18\x04 \x01(\tR\x06taskId\x12K\n\x08metadata\x18\x05 \x03(\x0b\x32/.iris.cluster.Controller.Endpoint.MetadataEntryR\x08metadata\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\xb9\x02\n\x17RegisterEndpointRequest\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x18\n\x07\x61\x64\x64ress\x18\x02 \x01(\tR\x07\x61\x64\x64ress\x12\x17\n\x07task_id\x18\x03 \x01(\tR\x06taskId\x12Z\n\x08metadata\x18\x04 \x03(\x0b\x32>.iris.cluster.Controller.RegisterEndpointRequest.MetadataEntryR\x08metadata\x12\x1d\n\nattempt_id\x18\x05 \x01(\x05R\tattemptId\x12\x1f\n\x0b\x65ndpoint_id\x18\x06 \x01(\tR\nendpointId\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a;\n\x18RegisterEndpointResponse\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x1a<\n\x19UnregisterEndpointRequest\x12\x1f\n\x0b\x65ndpoint_id\x18\x01 \x01(\tR\nendpointId\x1a\x44\n\x14ListEndpointsRequest\x12\x16\n\x06prefix\x18\x01 \x01(\tR\x06prefix\x12\x14\n\x05\x65xact\x18\x02 \x01(\x08R\x05\x65xact\x1aX\n\x15ListEndpointsResponse\x12?\n\tendpoints\x18\x01 \x03(\x0b\x32!.iris.cluster.Controller.EndpointR\tendpoints\x1a\x1c\n\x1aGetAutoscalerStatusRequest\x1aP\n\x1bGetAutoscalerStatusResponse\x12\x31\n\x06status\x18\x01 \x01(\x0b\x32\x19.iris.vm.AutoscalerStatusR\x06status\x1a\x18\n\x16\x42\x65ginCheckpointRequest\x1a\xd6\x01\n\x17\x42\x65ginCheckpointResponse\x12\'\n\x0f\x63heckpoint_path\x18\x01 \x01(\tR\x0e\x63heckpointPath\x12\x33\n\ncreated_at\x18\x02 \x01(\x0b\x32\x14.iris.time.TimestampR\tcreatedAt\x12\x1b\n\tjob_count\x18\x03 \x01(\x05R\x08jobCount\x12\x1d\n\ntask_count\x18\x04 \x01(\x05R\ttaskCount\x12!\n\x0cworker_count\x18\x05 \x01(\x05R\x0bworkerCount\x1a\xf3\x02\n\x0bUserSummary\x12\x12\n\x04user\x18\x01 \x01(\tR\x04user\x12\x65\n\x11task_state_counts\x18\x02 \x03(\x0b\x32\x39.iris.cluster.Controller.UserSummary.TaskStateCountsEntryR\x0ftaskStateCounts\x12\x62\n\x10job_state_counts\x18\x03 
\x03(\x0b\x32\x38.iris.cluster.Controller.UserSummary.JobStateCountsEntryR\x0ejobStateCounts\x1a\x42\n\x14TaskStateCountsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\x05R\x05value:\x02\x38\x01\x1a\x41\n\x13JobStateCountsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\x05R\x05value:\x02\x38\x01\x1a\x12\n\x10ListUsersRequest\x1aO\n\x11ListUsersResponse\x12:\n\x05users\x18\x01 \x03(\x0b\x32$.iris.cluster.Controller.UserSummaryR\x05users\x1a\x98\x02\n\x12GetTaskLogsRequest\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12)\n\x10include_children\x18\x02 \x01(\x08R\x0fincludeChildren\x12\x19\n\x08since_ms\x18\x03 \x01(\x03R\x07sinceMs\x12&\n\x0fmax_total_lines\x18\x04 \x01(\x03R\rmaxTotalLines\x12\x1c\n\tsubstring\x18\x05 \x01(\tR\tsubstring\x12\x1d\n\nattempt_id\x18\x06 \x01(\x05R\tattemptId\x12\x1b\n\tmin_level\x18\x07 \x01(\tR\x08minLevel\x12\x16\n\x06\x63ursor\x18\x08 \x01(\x03R\x06\x63ursor\x12\x12\n\x04tail\x18\t \x01(\x08R\x04tail\x1a\x86\x01\n\x0cTaskLogBatch\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12*\n\x04logs\x18\x02 \x03(\x0b\x32\x16.iris.logging.LogEntryR\x04logs\x12\x14\n\x05\x65rror\x18\x03 \x01(\tR\x05\x65rror\x12\x1b\n\tworker_id\x18\x04 \x01(\tR\x08workerId\x1a\xd2\x01\n\x13GetTaskLogsResponse\x12\x42\n\ttask_logs\x18\x01 \x03(\x0b\x32%.iris.cluster.Controller.TaskLogBatchR\x08taskLogs\x12\x1c\n\ttruncated\x18\x02 \x01(\x08R\ttruncated\x12\x41\n\x12\x63hild_job_statuses\x18\x03 \x03(\x0b\x32\x13.iris.job.JobStatusR\x10\x63hildJobStatuses\x12\x16\n\x06\x63ursor\x18\x04 \x01(\x03R\x06\x63ursor\x1a(\n\x16GetWorkerStatusRequest\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x1a\xe8\x03\n\x17GetWorkerStatusResponse\x12\x1f\n\x02vm\x18\x01 \x01(\x0b\x32\x0f.iris.vm.VmInfoR\x02vm\x12\x1f\n\x0bscale_group\x18\x02 \x01(\tR\nscaleGroup\x12\x43\n\x06worker\x18\x03 \x01(\x0b\x32+.iris.cluster.Controller.WorkerHealthStatusR\x06worker\x12%\n\x0e\x62ootstrap_logs\x18\x04 \x01(\tR\rbootstrapLogs\x12\x44\n\x12worker_log_entries\x18\t \x03(\x0b\x32\x16.iris.logging.LogEntryR\x10workerLogEntries\x12\x37\n\x0crecent_tasks\x18\x06 \x03(\x0b\x32\x14.iris.job.TaskStatusR\x0brecentTasks\x12M\n\x11\x63urrent_resources\x18\x07 \x01(\x0b\x32 .iris.job.WorkerResourceSnapshotR\x10\x63urrentResources\x12K\n\x10resource_history\x18\x08 \x03(\x0b\x32 .iris.job.WorkerResourceSnapshotR\x0fresourceHistoryJ\x04\x08\x05\x10\x06\x1a\xce\x01\n\x0fSchedulingEvent\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x1d\n\nattempt_id\x18\x02 \x01(\x05R\tattemptId\x12\x1d\n\nevent_type\x18\x03 \x01(\tR\teventType\x12\x16\n\x06reason\x18\x04 \x01(\tR\x06reason\x12\x18\n\x07message\x18\x05 \x01(\tR\x07message\x12\x32\n\ttimestamp\x18\x06 \x01(\x0b\x32\x14.iris.time.TimestampR\ttimestamp\x1a\x8e\x02\n\x0f\x43lusterCapacity\x12+\n\x11schedulable_nodes\x18\x01 \x01(\x05R\x10schedulableNodes\x12\x30\n\x14total_cpu_millicores\x18\x02 \x01(\x03R\x12totalCpuMillicores\x12\x38\n\x18\x61vailable_cpu_millicores\x18\x03 \x01(\x03R\x16\x61vailableCpuMillicores\x12,\n\x12total_memory_bytes\x18\x04 \x01(\x03R\x10totalMemoryBytes\x12\x34\n\x16\x61vailable_memory_bytes\x18\x05 \x01(\x03R\x14\x61vailableMemoryBytes\x1a\x1a\n\x18GetProviderStatusRequest\x1a\xe8\x01\n\x19GetProviderStatusResponse\x12.\n\x13has_direct_provider\x18\x01 \x01(\x08R\x11hasDirectProvider\x12U\n\x11scheduling_events\x18\x02 \x03(\x0b\x32(.iris.cluster.Controller.SchedulingEventR\x10schedulingEvents\x12\x44\n\x08\x63\x61pacity\x18\x03 
\x01(\x0b\x32(.iris.cluster.Controller.ClusterCapacityR\x08\x63\x61pacity\x1a#\n!GetKubernetesClusterStatusRequest\x1a\xed\x01\n\x13KubernetesPodStatus\x12\x19\n\x08pod_name\x18\x01 \x01(\tR\x07podName\x12\x17\n\x07task_id\x18\x02 \x01(\tR\x06taskId\x12\x14\n\x05phase\x18\x03 \x01(\tR\x05phase\x12\x16\n\x06reason\x18\x04 \x01(\tR\x06reason\x12\x18\n\x07message\x18\x05 \x01(\tR\x07message\x12=\n\x0flast_transition\x18\x06 \x01(\x0b\x32\x14.iris.time.TimestampR\x0elastTransition\x12\x1b\n\tnode_name\x18\x07 \x01(\tR\x08nodeName\x1a\x8f\x03\n\x0eNodePoolStatus\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12#\n\rinstance_type\x18\x02 \x01(\tR\x0cinstanceType\x12\x1f\n\x0bscale_group\x18\x03 \x01(\tR\nscaleGroup\x12!\n\x0ctarget_nodes\x18\x04 \x01(\x05R\x0btargetNodes\x12#\n\rcurrent_nodes\x18\x05 \x01(\x05R\x0c\x63urrentNodes\x12!\n\x0cqueued_nodes\x18\x06 \x01(\x05R\x0bqueuedNodes\x12*\n\x11in_progress_nodes\x18\x07 \x01(\x05R\x0finProgressNodes\x12 \n\x0b\x61utoscaling\x18\x08 \x01(\x08R\x0b\x61utoscaling\x12\x1b\n\tmin_nodes\x18\t \x01(\x05R\x08minNodes\x12\x1b\n\tmax_nodes\x18\n \x01(\x05R\x08maxNodes\x12\x1a\n\x08\x63\x61pacity\x18\x0b \x01(\tR\x08\x63\x61pacity\x12\x14\n\x05quota\x18\x0c \x01(\tR\x05quota\x1a\xac\x03\n\"GetKubernetesClusterStatusResponse\x12\x1c\n\tnamespace\x18\x01 \x01(\tR\tnamespace\x12\x1f\n\x0btotal_nodes\x18\x02 \x01(\x05R\ntotalNodes\x12+\n\x11schedulable_nodes\x18\x03 \x01(\x05R\x10schedulableNodes\x12\'\n\x0f\x61llocatable_cpu\x18\x04 \x01(\tR\x0e\x61llocatableCpu\x12-\n\x12\x61llocatable_memory\x18\x05 \x01(\tR\x11\x61llocatableMemory\x12O\n\x0cpod_statuses\x18\x06 \x03(\x0b\x32,.iris.cluster.Controller.KubernetesPodStatusR\x0bpodStatuses\x12)\n\x10provider_version\x18\x07 \x01(\tR\x0fproviderVersion\x12\x46\n\nnode_pools\x18\x08 \x03(\x0b\x32\'.iris.cluster.Controller.NodePoolStatusR\tnodePools\x1a\x33\n\x14RestartWorkerRequest\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x1aI\n\x15RestartWorkerResponse\x12\x1a\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08R\x08\x61\x63\x63\x65pted\x12\x14\n\x05\x65rror\x18\x02 \x01(\tR\x05\x65rror\x1a\x85\x01\n\x14SetUserBudgetRequest\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 \x01(\x03R\x0b\x62udgetLimit\x12\x31\n\x08max_band\x18\x03 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x1a\x17\n\x15SetUserBudgetResponse\x1a/\n\x14GetUserBudgetRequest\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x1a\xa9\x01\n\x15GetUserBudgetResponse\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 \x01(\x03R\x0b\x62udgetLimit\x12!\n\x0c\x62udget_spent\x18\x03 \x01(\x03R\x0b\x62udgetSpent\x12\x31\n\x08max_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x1a\x18\n\x16ListUserBudgetsRequest\x1a_\n\x17ListUserBudgetsResponse\x12\x44\n\x05users\x18\x01 \x03(\x0b\x32..iris.cluster.Controller.GetUserBudgetResponseR\x05users\x1al\n\x17UpdateTaskStatusRequest\x12\x1b\n\tworker_id\x18\x01 \x01(\tR\x08workerId\x12\x34\n\x07updates\x18\x02 \x03(\x0b\x32\x1a.iris.job.WorkerTaskStatusR\x07updates\x1a\x1a\n\x18UpdateTaskStatusResponse\x1a\x1a\n\x18GetSchedulerStateRequest\x1a\xa7\x02\n\x12SchedulerTaskEntry\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x15\n\x06job_id\x18\x02 \x01(\tR\x05jobId\x12\x17\n\x07user_id\x18\x03 \x01(\tR\x06userId\x12;\n\roriginal_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x0coriginalBand\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12%\n\x0equeue_position\x18\x06 
\x01(\x05R\rqueuePosition\x12%\n\x0eresource_value\x18\x07 \x01(\x05R\rresourceValue\x1a\xa7\x01\n\x12SchedulerBandGroup\x12*\n\x04\x62\x61nd\x18\x01 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x04\x62\x61nd\x12\x41\n\x05tasks\x18\x02 \x03(\x0b\x32+.iris.cluster.Controller.SchedulerTaskEntryR\x05tasks\x12\"\n\rtotal_in_band\x18\x03 \x01(\x05R\x0btotalInBand\x1a\x97\x02\n\x13SchedulerUserBudget\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12!\n\x0c\x62udget_limit\x18\x02 \x01(\x03R\x0b\x62udgetLimit\x12!\n\x0c\x62udget_spent\x18\x03 \x01(\x03R\x0b\x62udgetSpent\x12\x31\n\x08max_band\x18\x04 \x01(\x0e\x32\x16.iris.job.PriorityBandR\x07maxBand\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12/\n\x13utilization_percent\x18\x06 \x01(\x02R\x12utilizationPercent\x1a\xea\x02\n\x14SchedulerRunningTask\x12\x17\n\x07task_id\x18\x01 \x01(\tR\x06taskId\x12\x15\n\x06job_id\x18\x02 \x01(\tR\x05jobId\x12\x17\n\x07user_id\x18\x03 \x01(\tR\x06userId\x12\x1b\n\tworker_id\x18\x04 \x01(\tR\x08workerId\x12=\n\x0e\x65\x66\x66\x65\x63tive_band\x18\x05 \x01(\x0e\x32\x16.iris.job.PriorityBandR\reffectiveBand\x12%\n\x0eresource_value\x18\x06 \x01(\x05R\rresourceValue\x12 \n\x0bpreemptible\x18\x07 \x01(\x08R\x0bpreemptible\x12=\n\x0epreemptible_by\x18\x08 \x03(\x0e\x32\x16.iris.job.PriorityBandR\rpreemptibleBy\x12%\n\x0eis_coscheduled\x18\t \x01(\x08R\risCoscheduled\x1a\xdc\x02\n\x19GetSchedulerStateResponse\x12P\n\rpending_queue\x18\x01 \x03(\x0b\x32+.iris.cluster.Controller.SchedulerBandGroupR\x0cpendingQueue\x12O\n\x0cuser_budgets\x18\x02 \x03(\x0b\x32,.iris.cluster.Controller.SchedulerUserBudgetR\x0buserBudgets\x12R\n\rrunning_tasks\x18\x03 \x03(\x0b\x32-.iris.cluster.Controller.SchedulerRunningTaskR\x0crunningTasks\x12#\n\rtotal_pending\x18\x04 \x01(\x05R\x0ctotalPending\x12#\n\rtotal_running\x18\x05 
\x01(\x05R\x0ctotalRunning\"\xb7\x01\n\x0cJobSortField\x12\x1e\n\x1aJOB_SORT_FIELD_UNSPECIFIED\x10\x00\x12\x17\n\x13JOB_SORT_FIELD_DATE\x10\x01\x12\x17\n\x13JOB_SORT_FIELD_NAME\x10\x02\x12\x18\n\x14JOB_SORT_FIELD_STATE\x10\x03\x12\x1b\n\x17JOB_SORT_FIELD_FAILURES\x10\x04\x12\x1e\n\x1aJOB_SORT_FIELD_PREEMPTIONS\x10\x05\"`\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x16\n\x12SORT_DIRECTION_ASC\x10\x01\x12\x17\n\x13SORT_DIRECTION_DESC\x10\x02\"\x82\x01\n\rJobQueryScope\x12\x1f\n\x1bJOB_QUERY_SCOPE_UNSPECIFIED\x10\x00\x12\x17\n\x13JOB_QUERY_SCOPE_ALL\x10\x01\x12\x19\n\x15JOB_QUERY_SCOPE_ROOTS\x10\x02\x12\x1c\n\x18JOB_QUERY_SCOPE_CHILDREN\x10\x03\x32\x8e\x1c\n\x11\x43ontrollerService\x12\x62\n\tLaunchJob\x12).iris.cluster.Controller.LaunchJobRequest\x1a*.iris.cluster.Controller.LaunchJobResponse\x12k\n\x0cGetJobStatus\x12,.iris.cluster.Controller.GetJobStatusRequest\x1a-.iris.cluster.Controller.GetJobStatusResponse\x12h\n\x0bGetJobState\x12+.iris.cluster.Controller.GetJobStateRequest\x1a,.iris.cluster.Controller.GetJobStateResponse\x12M\n\x0cTerminateJob\x12,.iris.cluster.Controller.TerminateJobRequest\x1a\x0f.iris.job.Empty\x12_\n\x08ListJobs\x12(.iris.cluster.Controller.ListJobsRequest\x1a).iris.cluster.Controller.ListJobsResponse\x12n\n\rGetTaskStatus\x12-.iris.cluster.Controller.GetTaskStatusRequest\x1a..iris.cluster.Controller.GetTaskStatusResponse\x12\x62\n\tListTasks\x12).iris.cluster.Controller.ListTasksRequest\x1a*.iris.cluster.Controller.ListTasksResponse\x12_\n\x08Register\x12(.iris.cluster.Controller.RegisterRequest\x1a).iris.cluster.Controller.RegisterResponse\x12h\n\x0bListWorkers\x12+.iris.cluster.Controller.ListWorkersRequest\x1a,.iris.cluster.Controller.ListWorkersResponse\x12w\n\x10RegisterEndpoint\x12\x30.iris.cluster.Controller.RegisterEndpointRequest\x1a\x31.iris.cluster.Controller.RegisterEndpointResponse\x12Y\n\x12UnregisterEndpoint\x12\x32.iris.cluster.Controller.UnregisterEndpointRequest\x1a\x0f.iris.job.Empty\x12n\n\rListEndpoints\x12-.iris.cluster.Controller.ListEndpointsRequest\x1a..iris.cluster.Controller.ListEndpointsResponse\x12\x80\x01\n\x13GetAutoscalerStatus\x12\x33.iris.cluster.Controller.GetAutoscalerStatusRequest\x1a\x34.iris.cluster.Controller.GetAutoscalerStatusResponse\x12\x62\n\tListUsers\x12).iris.cluster.Controller.ListUsersRequest\x1a*.iris.cluster.Controller.ListUsersResponse\x12h\n\x0bGetTaskLogs\x12+.iris.cluster.Controller.GetTaskLogsRequest\x1a,.iris.cluster.Controller.GetTaskLogsResponse\x12J\n\x0bProfileTask\x12\x1c.iris.job.ProfileTaskRequest\x1a\x1d.iris.job.ProfileTaskResponse\x12t\n\x0f\x45xecInContainer\x12/.iris.cluster.Controller.ExecInContainerRequest\x1a\x30.iris.cluster.Controller.ExecInContainerResponse\x12t\n\x0fGetWorkerStatus\x12/.iris.cluster.Controller.GetWorkerStatusRequest\x1a\x30.iris.cluster.Controller.GetWorkerStatusResponse\x12t\n\x0f\x42\x65ginCheckpoint\x12/.iris.cluster.Controller.BeginCheckpointRequest\x1a\x30.iris.cluster.Controller.BeginCheckpointResponse\x12Y\n\x10GetProcessStatus\x12!.iris.job.GetProcessStatusRequest\x1a\".iris.job.GetProcessStatusResponse\x12J\n\x0bGetAuthInfo\x12\x1c.iris.job.GetAuthInfoRequest\x1a\x1d.iris.job.GetAuthInfoResponse\x12\x38\n\x05Login\x12\x16.iris.job.LoginRequest\x1a\x17.iris.job.LoginResponse\x12M\n\x0c\x43reateApiKey\x12\x1d.iris.job.CreateApiKeyRequest\x1a\x1e.iris.job.CreateApiKeyResponse\x12>\n\x0cRevokeApiKey\x12\x1d.iris.job.RevokeApiKeyRequest\x1a\x0f.iris.job.Empty\x12J\n\x0bListApiKeys\x12\x1c.iris.job.ListApiKeysRequest\x1a\x1d.iris.job.ListApiKe
ysResponse\x12S\n\x0eGetCurrentUser\x12\x1f.iris.job.GetCurrentUserRequest\x1a .iris.job.GetCurrentUserResponse\x12z\n\x11GetProviderStatus\x12\x31.iris.cluster.Controller.GetProviderStatusRequest\x1a\x32.iris.cluster.Controller.GetProviderStatusResponse\x12\x95\x01\n\x1aGetKubernetesClusterStatus\x12:.iris.cluster.Controller.GetKubernetesClusterStatusRequest\x1a;.iris.cluster.Controller.GetKubernetesClusterStatusResponse\x12L\n\x0f\x45xecuteRawQuery\x12\x1b.iris.query.RawQueryRequest\x1a\x1c.iris.query.RawQueryResponse\x12n\n\rRestartWorker\x12-.iris.cluster.Controller.RestartWorkerRequest\x1a..iris.cluster.Controller.RestartWorkerResponse\x12n\n\rSetUserBudget\x12-.iris.cluster.Controller.SetUserBudgetRequest\x1a..iris.cluster.Controller.SetUserBudgetResponse\x12n\n\rGetUserBudget\x12-.iris.cluster.Controller.GetUserBudgetRequest\x1a..iris.cluster.Controller.GetUserBudgetResponse\x12t\n\x0fListUserBudgets\x12/.iris.cluster.Controller.ListUserBudgetsRequest\x1a\x30.iris.cluster.Controller.ListUserBudgetsResponse\x12z\n\x11GetSchedulerState\x12\x31.iris.cluster.Controller.GetSchedulerStateRequest\x1a\x32.iris.cluster.Controller.GetSchedulerStateResponse\x12w\n\x10UpdateTaskStatus\x12\x30.iris.cluster.Controller.UpdateTaskStatusRequest\x1a\x31.iris.cluster.Controller.UpdateTaskStatusResponseBt\n\x10\x63om.iris.clusterB\x0f\x43ontrollerProtoP\x01\xa2\x02\x03ICX\xaa\x02\x0cIris.Cluster\xca\x02\x0cIris\\Cluster\xe2\x02\x18Iris\\Cluster\\GPBMetadata\xea\x02\rIris::Clusterb\x08\x65\x64itionsp\xe8\x07') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -48,7 +48,7 @@ _globals['_CONTROLLER_USERSUMMARY_JOBSTATECOUNTSENTRY']._loaded_options = None _globals['_CONTROLLER_USERSUMMARY_JOBSTATECOUNTSENTRY']._serialized_options = b'8\001' _globals['_CONTROLLER']._serialized_start=96 - _globals['_CONTROLLER']._serialized_end=11672 + _globals['_CONTROLLER']._serialized_end=11374 _globals['_CONTROLLER_LAUNCHJOBREQUEST']._serialized_start=111 _globals['_CONTROLLER_LAUNCHJOBREQUEST']._serialized_end=1206 _globals['_CONTROLLER_LAUNCHJOBRESPONSE']._serialized_start=1208 @@ -115,90 +115,84 @@ _globals['_CONTROLLER_GETAUTOSCALERSTATUSREQUEST']._serialized_end=4871 _globals['_CONTROLLER_GETAUTOSCALERSTATUSRESPONSE']._serialized_start=4873 _globals['_CONTROLLER_GETAUTOSCALERSTATUSRESPONSE']._serialized_end=4953 - _globals['_CONTROLLER_TRANSACTIONACTION']._serialized_start=4956 - _globals['_CONTROLLER_TRANSACTIONACTION']._serialized_end=5106 - _globals['_CONTROLLER_GETTRANSACTIONSREQUEST']._serialized_start=5108 - _globals['_CONTROLLER_GETTRANSACTIONSREQUEST']._serialized_end=5154 - _globals['_CONTROLLER_GETTRANSACTIONSRESPONSE']._serialized_start=5156 - _globals['_CONTROLLER_GETTRANSACTIONSRESPONSE']._serialized_end=5251 - _globals['_CONTROLLER_BEGINCHECKPOINTREQUEST']._serialized_start=5253 - _globals['_CONTROLLER_BEGINCHECKPOINTREQUEST']._serialized_end=5277 - _globals['_CONTROLLER_BEGINCHECKPOINTRESPONSE']._serialized_start=5280 - _globals['_CONTROLLER_BEGINCHECKPOINTRESPONSE']._serialized_end=5494 - _globals['_CONTROLLER_USERSUMMARY']._serialized_start=5497 - _globals['_CONTROLLER_USERSUMMARY']._serialized_end=5868 - _globals['_CONTROLLER_USERSUMMARY_TASKSTATECOUNTSENTRY']._serialized_start=5735 - _globals['_CONTROLLER_USERSUMMARY_TASKSTATECOUNTSENTRY']._serialized_end=5801 - _globals['_CONTROLLER_USERSUMMARY_JOBSTATECOUNTSENTRY']._serialized_start=5803 - _globals['_CONTROLLER_USERSUMMARY_JOBSTATECOUNTSENTRY']._serialized_end=5868 - 
_globals['_CONTROLLER_LISTUSERSREQUEST']._serialized_start=5870 - _globals['_CONTROLLER_LISTUSERSREQUEST']._serialized_end=5888 - _globals['_CONTROLLER_LISTUSERSRESPONSE']._serialized_start=5890 - _globals['_CONTROLLER_LISTUSERSRESPONSE']._serialized_end=5969 - _globals['_CONTROLLER_GETTASKLOGSREQUEST']._serialized_start=5972 - _globals['_CONTROLLER_GETTASKLOGSREQUEST']._serialized_end=6252 - _globals['_CONTROLLER_TASKLOGBATCH']._serialized_start=6255 - _globals['_CONTROLLER_TASKLOGBATCH']._serialized_end=6389 - _globals['_CONTROLLER_GETTASKLOGSRESPONSE']._serialized_start=6392 - _globals['_CONTROLLER_GETTASKLOGSRESPONSE']._serialized_end=6602 - _globals['_CONTROLLER_GETWORKERSTATUSREQUEST']._serialized_start=6604 - _globals['_CONTROLLER_GETWORKERSTATUSREQUEST']._serialized_end=6644 - _globals['_CONTROLLER_GETWORKERSTATUSRESPONSE']._serialized_start=6647 - _globals['_CONTROLLER_GETWORKERSTATUSRESPONSE']._serialized_end=7135 - _globals['_CONTROLLER_SCHEDULINGEVENT']._serialized_start=7138 - _globals['_CONTROLLER_SCHEDULINGEVENT']._serialized_end=7344 - _globals['_CONTROLLER_CLUSTERCAPACITY']._serialized_start=7347 - _globals['_CONTROLLER_CLUSTERCAPACITY']._serialized_end=7617 - _globals['_CONTROLLER_GETPROVIDERSTATUSREQUEST']._serialized_start=7619 - _globals['_CONTROLLER_GETPROVIDERSTATUSREQUEST']._serialized_end=7645 - _globals['_CONTROLLER_GETPROVIDERSTATUSRESPONSE']._serialized_start=7648 - _globals['_CONTROLLER_GETPROVIDERSTATUSRESPONSE']._serialized_end=7880 - _globals['_CONTROLLER_GETKUBERNETESCLUSTERSTATUSREQUEST']._serialized_start=7882 - _globals['_CONTROLLER_GETKUBERNETESCLUSTERSTATUSREQUEST']._serialized_end=7917 - _globals['_CONTROLLER_KUBERNETESPODSTATUS']._serialized_start=7920 - _globals['_CONTROLLER_KUBERNETESPODSTATUS']._serialized_end=8157 - _globals['_CONTROLLER_NODEPOOLSTATUS']._serialized_start=8160 - _globals['_CONTROLLER_NODEPOOLSTATUS']._serialized_end=8559 - _globals['_CONTROLLER_GETKUBERNETESCLUSTERSTATUSRESPONSE']._serialized_start=8562 - _globals['_CONTROLLER_GETKUBERNETESCLUSTERSTATUSRESPONSE']._serialized_end=8990 - _globals['_CONTROLLER_RESTARTWORKERREQUEST']._serialized_start=8992 - _globals['_CONTROLLER_RESTARTWORKERREQUEST']._serialized_end=9043 - _globals['_CONTROLLER_RESTARTWORKERRESPONSE']._serialized_start=9045 - _globals['_CONTROLLER_RESTARTWORKERRESPONSE']._serialized_end=9118 - _globals['_CONTROLLER_SETUSERBUDGETREQUEST']._serialized_start=9121 - _globals['_CONTROLLER_SETUSERBUDGETREQUEST']._serialized_end=9254 - _globals['_CONTROLLER_SETUSERBUDGETRESPONSE']._serialized_start=9256 - _globals['_CONTROLLER_SETUSERBUDGETRESPONSE']._serialized_end=9279 - _globals['_CONTROLLER_GETUSERBUDGETREQUEST']._serialized_start=9281 - _globals['_CONTROLLER_GETUSERBUDGETREQUEST']._serialized_end=9328 - _globals['_CONTROLLER_GETUSERBUDGETRESPONSE']._serialized_start=9331 - _globals['_CONTROLLER_GETUSERBUDGETRESPONSE']._serialized_end=9500 - _globals['_CONTROLLER_LISTUSERBUDGETSREQUEST']._serialized_start=9502 - _globals['_CONTROLLER_LISTUSERBUDGETSREQUEST']._serialized_end=9526 - _globals['_CONTROLLER_LISTUSERBUDGETSRESPONSE']._serialized_start=9528 - _globals['_CONTROLLER_LISTUSERBUDGETSRESPONSE']._serialized_end=9623 - _globals['_CONTROLLER_UPDATETASKSTATUSREQUEST']._serialized_start=9625 - _globals['_CONTROLLER_UPDATETASKSTATUSREQUEST']._serialized_end=9733 - _globals['_CONTROLLER_UPDATETASKSTATUSRESPONSE']._serialized_start=9735 - _globals['_CONTROLLER_UPDATETASKSTATUSRESPONSE']._serialized_end=9761 - 
_globals['_CONTROLLER_GETSCHEDULERSTATEREQUEST']._serialized_start=9763 - _globals['_CONTROLLER_GETSCHEDULERSTATEREQUEST']._serialized_end=9789 - _globals['_CONTROLLER_SCHEDULERTASKENTRY']._serialized_start=9792 - _globals['_CONTROLLER_SCHEDULERTASKENTRY']._serialized_end=10087 - _globals['_CONTROLLER_SCHEDULERBANDGROUP']._serialized_start=10090 - _globals['_CONTROLLER_SCHEDULERBANDGROUP']._serialized_end=10257 - _globals['_CONTROLLER_SCHEDULERUSERBUDGET']._serialized_start=10260 - _globals['_CONTROLLER_SCHEDULERUSERBUDGET']._serialized_end=10539 - _globals['_CONTROLLER_SCHEDULERRUNNINGTASK']._serialized_start=10542 - _globals['_CONTROLLER_SCHEDULERRUNNINGTASK']._serialized_end=10904 - _globals['_CONTROLLER_GETSCHEDULERSTATERESPONSE']._serialized_start=10907 - _globals['_CONTROLLER_GETSCHEDULERSTATERESPONSE']._serialized_end=11255 - _globals['_CONTROLLER_JOBSORTFIELD']._serialized_start=11258 - _globals['_CONTROLLER_JOBSORTFIELD']._serialized_end=11441 - _globals['_CONTROLLER_SORTDIRECTION']._serialized_start=11443 - _globals['_CONTROLLER_SORTDIRECTION']._serialized_end=11539 - _globals['_CONTROLLER_JOBQUERYSCOPE']._serialized_start=11542 - _globals['_CONTROLLER_JOBQUERYSCOPE']._serialized_end=11672 - _globals['_CONTROLLERSERVICE']._serialized_start=11675 - _globals['_CONTROLLERSERVICE']._serialized_end=15391 + _globals['_CONTROLLER_BEGINCHECKPOINTREQUEST']._serialized_start=4955 + _globals['_CONTROLLER_BEGINCHECKPOINTREQUEST']._serialized_end=4979 + _globals['_CONTROLLER_BEGINCHECKPOINTRESPONSE']._serialized_start=4982 + _globals['_CONTROLLER_BEGINCHECKPOINTRESPONSE']._serialized_end=5196 + _globals['_CONTROLLER_USERSUMMARY']._serialized_start=5199 + _globals['_CONTROLLER_USERSUMMARY']._serialized_end=5570 + _globals['_CONTROLLER_USERSUMMARY_TASKSTATECOUNTSENTRY']._serialized_start=5437 + _globals['_CONTROLLER_USERSUMMARY_TASKSTATECOUNTSENTRY']._serialized_end=5503 + _globals['_CONTROLLER_USERSUMMARY_JOBSTATECOUNTSENTRY']._serialized_start=5505 + _globals['_CONTROLLER_USERSUMMARY_JOBSTATECOUNTSENTRY']._serialized_end=5570 + _globals['_CONTROLLER_LISTUSERSREQUEST']._serialized_start=5572 + _globals['_CONTROLLER_LISTUSERSREQUEST']._serialized_end=5590 + _globals['_CONTROLLER_LISTUSERSRESPONSE']._serialized_start=5592 + _globals['_CONTROLLER_LISTUSERSRESPONSE']._serialized_end=5671 + _globals['_CONTROLLER_GETTASKLOGSREQUEST']._serialized_start=5674 + _globals['_CONTROLLER_GETTASKLOGSREQUEST']._serialized_end=5954 + _globals['_CONTROLLER_TASKLOGBATCH']._serialized_start=5957 + _globals['_CONTROLLER_TASKLOGBATCH']._serialized_end=6091 + _globals['_CONTROLLER_GETTASKLOGSRESPONSE']._serialized_start=6094 + _globals['_CONTROLLER_GETTASKLOGSRESPONSE']._serialized_end=6304 + _globals['_CONTROLLER_GETWORKERSTATUSREQUEST']._serialized_start=6306 + _globals['_CONTROLLER_GETWORKERSTATUSREQUEST']._serialized_end=6346 + _globals['_CONTROLLER_GETWORKERSTATUSRESPONSE']._serialized_start=6349 + _globals['_CONTROLLER_GETWORKERSTATUSRESPONSE']._serialized_end=6837 + _globals['_CONTROLLER_SCHEDULINGEVENT']._serialized_start=6840 + _globals['_CONTROLLER_SCHEDULINGEVENT']._serialized_end=7046 + _globals['_CONTROLLER_CLUSTERCAPACITY']._serialized_start=7049 + _globals['_CONTROLLER_CLUSTERCAPACITY']._serialized_end=7319 + _globals['_CONTROLLER_GETPROVIDERSTATUSREQUEST']._serialized_start=7321 + _globals['_CONTROLLER_GETPROVIDERSTATUSREQUEST']._serialized_end=7347 + _globals['_CONTROLLER_GETPROVIDERSTATUSRESPONSE']._serialized_start=7350 + _globals['_CONTROLLER_GETPROVIDERSTATUSRESPONSE']._serialized_end=7582 + 
_globals['_CONTROLLER_GETKUBERNETESCLUSTERSTATUSREQUEST']._serialized_start=7584 + _globals['_CONTROLLER_GETKUBERNETESCLUSTERSTATUSREQUEST']._serialized_end=7619 + _globals['_CONTROLLER_KUBERNETESPODSTATUS']._serialized_start=7622 + _globals['_CONTROLLER_KUBERNETESPODSTATUS']._serialized_end=7859 + _globals['_CONTROLLER_NODEPOOLSTATUS']._serialized_start=7862 + _globals['_CONTROLLER_NODEPOOLSTATUS']._serialized_end=8261 + _globals['_CONTROLLER_GETKUBERNETESCLUSTERSTATUSRESPONSE']._serialized_start=8264 + _globals['_CONTROLLER_GETKUBERNETESCLUSTERSTATUSRESPONSE']._serialized_end=8692 + _globals['_CONTROLLER_RESTARTWORKERREQUEST']._serialized_start=8694 + _globals['_CONTROLLER_RESTARTWORKERREQUEST']._serialized_end=8745 + _globals['_CONTROLLER_RESTARTWORKERRESPONSE']._serialized_start=8747 + _globals['_CONTROLLER_RESTARTWORKERRESPONSE']._serialized_end=8820 + _globals['_CONTROLLER_SETUSERBUDGETREQUEST']._serialized_start=8823 + _globals['_CONTROLLER_SETUSERBUDGETREQUEST']._serialized_end=8956 + _globals['_CONTROLLER_SETUSERBUDGETRESPONSE']._serialized_start=8958 + _globals['_CONTROLLER_SETUSERBUDGETRESPONSE']._serialized_end=8981 + _globals['_CONTROLLER_GETUSERBUDGETREQUEST']._serialized_start=8983 + _globals['_CONTROLLER_GETUSERBUDGETREQUEST']._serialized_end=9030 + _globals['_CONTROLLER_GETUSERBUDGETRESPONSE']._serialized_start=9033 + _globals['_CONTROLLER_GETUSERBUDGETRESPONSE']._serialized_end=9202 + _globals['_CONTROLLER_LISTUSERBUDGETSREQUEST']._serialized_start=9204 + _globals['_CONTROLLER_LISTUSERBUDGETSREQUEST']._serialized_end=9228 + _globals['_CONTROLLER_LISTUSERBUDGETSRESPONSE']._serialized_start=9230 + _globals['_CONTROLLER_LISTUSERBUDGETSRESPONSE']._serialized_end=9325 + _globals['_CONTROLLER_UPDATETASKSTATUSREQUEST']._serialized_start=9327 + _globals['_CONTROLLER_UPDATETASKSTATUSREQUEST']._serialized_end=9435 + _globals['_CONTROLLER_UPDATETASKSTATUSRESPONSE']._serialized_start=9437 + _globals['_CONTROLLER_UPDATETASKSTATUSRESPONSE']._serialized_end=9463 + _globals['_CONTROLLER_GETSCHEDULERSTATEREQUEST']._serialized_start=9465 + _globals['_CONTROLLER_GETSCHEDULERSTATEREQUEST']._serialized_end=9491 + _globals['_CONTROLLER_SCHEDULERTASKENTRY']._serialized_start=9494 + _globals['_CONTROLLER_SCHEDULERTASKENTRY']._serialized_end=9789 + _globals['_CONTROLLER_SCHEDULERBANDGROUP']._serialized_start=9792 + _globals['_CONTROLLER_SCHEDULERBANDGROUP']._serialized_end=9959 + _globals['_CONTROLLER_SCHEDULERUSERBUDGET']._serialized_start=9962 + _globals['_CONTROLLER_SCHEDULERUSERBUDGET']._serialized_end=10241 + _globals['_CONTROLLER_SCHEDULERRUNNINGTASK']._serialized_start=10244 + _globals['_CONTROLLER_SCHEDULERRUNNINGTASK']._serialized_end=10606 + _globals['_CONTROLLER_GETSCHEDULERSTATERESPONSE']._serialized_start=10609 + _globals['_CONTROLLER_GETSCHEDULERSTATERESPONSE']._serialized_end=10957 + _globals['_CONTROLLER_JOBSORTFIELD']._serialized_start=10960 + _globals['_CONTROLLER_JOBSORTFIELD']._serialized_end=11143 + _globals['_CONTROLLER_SORTDIRECTION']._serialized_start=11145 + _globals['_CONTROLLER_SORTDIRECTION']._serialized_end=11241 + _globals['_CONTROLLER_JOBQUERYSCOPE']._serialized_start=11244 + _globals['_CONTROLLER_JOBQUERYSCOPE']._serialized_end=11374 + _globals['_CONTROLLERSERVICE']._serialized_start=11377 + _globals['_CONTROLLERSERVICE']._serialized_end=14975 # @@protoc_insertion_point(module_scope) diff --git a/lib/iris/src/iris/rpc/controller_pb2.pyi b/lib/iris/src/iris/rpc/controller_pb2.pyi index 044157fea1..cd36a1af65 100644 --- a/lib/iris/src/iris/rpc/controller_pb2.pyi +++ 
b/lib/iris/src/iris/rpc/controller_pb2.pyi
@@ -341,27 +341,6 @@ class Controller(_message.Message):
         STATUS_FIELD_NUMBER: _ClassVar[int]
         status: _vm_pb2.AutoscalerStatus
         def __init__(self, status: _Optional[_Union[_vm_pb2.AutoscalerStatus, _Mapping]] = ...) -> None: ...
-    class TransactionAction(_message.Message):
-        __slots__ = ("timestamp", "action", "entity_id", "details")
-        TIMESTAMP_FIELD_NUMBER: _ClassVar[int]
-        ACTION_FIELD_NUMBER: _ClassVar[int]
-        ENTITY_ID_FIELD_NUMBER: _ClassVar[int]
-        DETAILS_FIELD_NUMBER: _ClassVar[int]
-        timestamp: _time_pb2.Timestamp
-        action: str
-        entity_id: str
-        details: str
-        def __init__(self, timestamp: _Optional[_Union[_time_pb2.Timestamp, _Mapping]] = ..., action: _Optional[str] = ..., entity_id: _Optional[str] = ..., details: _Optional[str] = ...) -> None: ...
-    class GetTransactionsRequest(_message.Message):
-        __slots__ = ("limit",)
-        LIMIT_FIELD_NUMBER: _ClassVar[int]
-        limit: int
-        def __init__(self, limit: _Optional[int] = ...) -> None: ...
-    class GetTransactionsResponse(_message.Message):
-        __slots__ = ("actions",)
-        ACTIONS_FIELD_NUMBER: _ClassVar[int]
-        actions: _containers.RepeatedCompositeFieldContainer[Controller.TransactionAction]
-        def __init__(self, actions: _Optional[_Iterable[_Union[Controller.TransactionAction, _Mapping]]] = ...) -> None: ...
     class BeginCheckpointRequest(_message.Message):
         __slots__ = ()
         def __init__(self) -> None: ...
diff --git a/lib/iris/tests/cluster/controller/test_heartbeat.py b/lib/iris/tests/cluster/controller/test_heartbeat.py
index 1b3fcbcb64..b0a92474d7 100644
--- a/lib/iris/tests/cluster/controller/test_heartbeat.py
+++ b/lib/iris/tests/cluster/controller/test_heartbeat.py
@@ -278,7 +278,7 @@ def test_handle_failed_heartbeats_logs_diagnostics(tmp_path, worker_metadata, ca
         tasks_to_kill=[],
     )
     acc = _SyncFailureAccumulator()
-    with caplog.at_level(logging.WARNING):
+    with caplog.at_level(logging.INFO, logger="iris.cluster.controller.transitions"):
         primary_failed_workers = controller._handle_failed_heartbeats(
             [(batch, "deadline exceeded after 12000ms")],
             acc,
@@ -286,10 +286,11 @@
         )
     assert primary_failed_workers == []
     assert acc.fail_count == 1
-    assert "worker=worker1" in caplog.text
+    assert "event=worker_heartbeat_failed" in caplog.text
+    assert "entity=worker1" in caplog.text
     assert "address=10.0.0.1:10001" in caplog.text
-    assert "action=transient_failure" in caplog.text
-    assert "last_success_age_s=" in caplog.text
+    assert "rpc_action=transient_failure" in caplog.text
+    assert "last_success_age_ms=" in caplog.text
     assert "deadline exceeded after 12000ms" in caplog.text

     controller.stop()
diff --git a/lib/iris/tests/cluster/controller/test_transitions.py b/lib/iris/tests/cluster/controller/test_transitions.py
index 55613cc670..4e07a1ba0c 100644
--- a/lib/iris/tests/cluster/controller/test_transitions.py
+++ b/lib/iris/tests/cluster/controller/test_transitions.py
@@ -3147,7 +3147,6 @@ def test_prune_old_terminal_jobs(state):
     result = state.prune_old_data(
         job_retention=Duration.from_seconds(86400),
         worker_retention=Duration.from_seconds(86400),
-        txn_action_retention=Duration.from_seconds(86400),
         profile_retention=Duration.from_seconds(86400),
     )

@@ -3179,7 +3178,6 @@ def test_prune_old_inactive_workers(state):
     result = state.prune_old_data(
         job_retention=Duration.from_seconds(86400),
         worker_retention=Duration.from_seconds(86400),
-        txn_action_retention=Duration.from_seconds(86400),
        profile_retention=Duration.from_seconds(86400),
     )

@@ -3188,35 +3186,18 @@
     assert _query_worker(state, stale_wid) is None  # pruned


-def test_prune_old_txn_actions(state):
-    """Old txn_actions are pruned by the txn_action retention."""
-    register_worker(state, "w1", "host:8080", make_worker_metadata())
-
-    # Submit a job to generate txn_actions, then backdate some
-    req = make_job_request("txn-test")
-    submit_job(state, "txn-test", req)
-
-    # Backdate all existing txn_actions to epoch
-    state._db.execute("UPDATE txn_actions SET created_at_ms = 1000")
-
-    old_txn_count = state._db.fetchone("SELECT COUNT(*) as c FROM txn_actions")["c"]
-
-    assert old_txn_count > 0
-
-    result = state.prune_old_data(
-        job_retention=Duration.from_seconds(86400),
-        worker_retention=Duration.from_seconds(86400),
-        txn_action_retention=Duration.from_seconds(86400),
-        profile_retention=Duration.from_seconds(86400),
-    )
-
-    assert result.txn_actions_deleted == old_txn_count
+def test_submit_job_emits_structured_audit_log(state, caplog):
+    """submit_job logs a structured event=job_submitted line for the log-store audit trail."""
+    import logging

-    remaining_txn_actions = state._db.fetchone("SELECT COUNT(*) as c FROM txn_actions")["c"]
+    req = make_job_request("audit-me")
+    with caplog.at_level(logging.INFO, logger="iris.cluster.controller.transitions"):
+        submit_job(state, "audit-me", req)

-    # Incremental prune deletes old txn_actions in batches; no new aggregate
-    # action rows are recorded for txn_action cleanup.
-    assert remaining_txn_actions == 0
+    job_wire = JobName.root("test-user", "audit-me").to_wire()
+    expected = f"event=job_submitted entity={job_wire}"
+    messages = [r.getMessage() for r in caplog.records]
+    assert any(expected in msg for msg in messages), messages


 def test_prune_noop_when_nothing_old(state):
@@ -3225,7 +3206,6 @@
     result = state.prune_old_data(
         job_retention=Duration.from_seconds(86400),
         worker_retention=Duration.from_seconds(86400),
-        txn_action_retention=Duration.from_seconds(86400),
         profile_retention=Duration.from_seconds(86400),
     )

@@ -3307,7 +3287,6 @@ def test_prune_old_data_short_circuits_when_nothing_prunable(state):
     result = state.prune_old_data(
         job_retention=Duration.from_seconds(86400),
         worker_retention=Duration.from_seconds(86400),
-        txn_action_retention=Duration.from_seconds(86400),
         profile_retention=Duration.from_seconds(86400),
     )
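
The new test_submit_job_emits_structured_audit_log and the updated heartbeat test both assert on raw substrings of the structured "event=... entity=... key=value" lines. If more suites adopt that format, a small caplog helper can make the assertions field-wise instead; the sketch below is illustrative only, under assumptions: parse_event_fields and find_events are hypothetical names not present in this patch, and the naive whitespace split only handles values without spaces (it would not cope with the free-form error text embedded in the heartbeat failure lines).

```python
# Illustrative helper, not part of this patch: parse structured
# "event=... entity=... key=value" audit lines captured by pytest's caplog.
import logging


def parse_event_fields(message: str) -> dict[str, str]:
    """Split a structured log message into key=value fields (space-free values only)."""
    fields: dict[str, str] = {}
    for token in message.split():
        key, sep, value = token.partition("=")
        if sep:
            fields[key] = value
    return fields


def find_events(records: list[logging.LogRecord], event: str) -> list[dict[str, str]]:
    """Return parsed fields for every record whose message carries event=<event>."""
    return [
        parse_event_fields(record.getMessage())
        for record in records
        if f"event={event}" in record.getMessage()
    ]
```

With such a helper, the substring assertion in the new test could be written as, for example, assert find_events(caplog.records, "job_submitted")[0]["entity"] == job_wire.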