Skip to content

Commit e6bf0d7

Browse files
[iris] Duplicate user budget tiers into cluster config
Migration 0037 is UPDATE-only and only acts on pre-existing user_budgets rows, so a DB wipe or a listed user who hasn't submitted a job yet lands on UserBudgetDefaults (budget_limit=0, max_band=BATCH) instead of their intended tier.

Move the tier membership into cluster config (IrisClusterConfig.user_budgets) so it is:
- Version-controlled next to the rest of the cluster config
- Reconciled into the DB on every controller start via an upsert
- Durable across DB wipes (migration is the one-shot fixup for the already-deployed prod DB; config is the source of truth going forward)

New proto message UserBudgetTier groups users sharing a budget_limit and max_band; config entries accept real PriorityBand names (PRIORITY_BAND_PRODUCTION etc.) in YAML.

Co-authored-by: William Held <Helw150@users.noreply.github.com>
1 parent dfde34d commit e6bf0d7

7 files changed

Lines changed: 337 additions & 99 deletions

File tree

lib/iris/examples/marin.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,3 +176,39 @@ scale_groups:
176176
service_account: iris-worker@hai-gcp-models.iam.gserviceaccount.com
177177
mode: GCP_SLICE_MODE_VM
178178
machine_type: n2-highmem-2
179+
180+
# ---------------------------------------------------------------------------
181+
# User budget tiers
182+
#
183+
# Reconciled into user_budgets on every controller start. Mirrors the one-shot
184+
# fixup in migrations/0037_user_budget_default.py so fresh DBs (and listed
185+
# users who haven't yet submitted a job) land on the intended tier instead of
186+
# UserBudgetDefaults (budget_limit=0, max_band=BATCH).
187+
#
188+
# Unlisted submitters stay on the default BATCH cap; see
189+
# docs/priority-bands.md → "Max-band caps" for the recovery path when an
190+
# unlisted user hits PERMISSION_DENIED on an INTERACTIVE submission.
191+
# ---------------------------------------------------------------------------
192+
user_budgets:
193+
# Admins: standard budget, may submit PRIORITY_BAND_PRODUCTION for work that
194+
# must not be downgraded. INTERACTIVE work still counts against the budget.
195+
- user_ids: [runner, power, dlwh, rav, romain, held, larry]
196+
budget_limit: 75000
197+
max_band: PRIORITY_BAND_PRODUCTION
198+
# Researchers: standard budget, capped at PRIORITY_BAND_INTERACTIVE.
199+
- user_ids:
200+
- ruili
201+
- quevedo
202+
- pc0618
203+
- konwoo
204+
- ahmedah
205+
- ahmed
206+
- rohith
207+
- tim
208+
- eczech
209+
- tonyhlee
210+
- kevin
211+
- calvinxu
212+
- moojink
213+
budget_limit: 75000
214+
max_band: PRIORITY_BAND_INTERACTIVE

lib/iris/src/iris/cluster/controller/budget.py

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,20 @@
55

66
from __future__ import annotations
77

8+
import json
9+
import logging
810
from collections import defaultdict
11+
from collections.abc import Iterable
912
from dataclasses import dataclass
1013
from typing import Generic, TypeVar
1114

12-
import json
15+
from rigging.timing import Timestamp
1316

14-
from iris.cluster.controller.db import ACTIVE_TASK_STATES, QuerySnapshot
17+
from iris.cluster.controller.db import ACTIVE_TASK_STATES, ControllerDB, QuerySnapshot
1518
from iris.cluster.types import JobName
16-
from iris.rpc import job_pb2
19+
from iris.rpc import config_pb2, job_pb2
20+
21+
logger = logging.getLogger(__name__)
1722

1823
T = TypeVar("T")
1924

@@ -139,3 +144,59 @@ def interleave_by_user(
139144
break
140145
round_idx += 1
141146
return result
147+
148+
149+
# Priority bands that a user_budgets tier may name as its max_band.
# PRIORITY_BAND_UNSPECIFIED is deliberately absent: a tier whose max_band is
# missing or zeroed then fails validation as a config error instead of quietly
# granting BATCH, so every tier has to name a real band.
_VALID_TIER_BANDS = frozenset(
    {
        job_pb2.PRIORITY_BAND_PRODUCTION,
        job_pb2.PRIORITY_BAND_INTERACTIVE,
        job_pb2.PRIORITY_BAND_BATCH,
    }
)
160+
161+
def reconcile_user_budget_tiers(
162+
db: ControllerDB,
163+
tiers: Iterable[config_pb2.UserBudgetTier],
164+
now: Timestamp,
165+
) -> int:
166+
"""Upsert per-user budgets from cluster config into the user_budgets table.
167+
168+
Runs at controller startup after auth is resolved. Each tier entry lists
169+
a set of user_ids that all receive the same budget_limit and max_band.
170+
Tiers are applied in order, so later tiers override earlier ones for
171+
users listed in both — lets ops promote a user by appending a later tier
172+
without editing earlier ones.
173+
174+
This complements migration 0037 (which fixes prod DBs that already have
175+
rows) by handling fresh DBs and listed users who haven't submitted yet:
176+
those users would otherwise land on the :class:`UserBudgetDefaults` row
177+
created via INSERT OR IGNORE at first submission time.
178+
179+
Returns the number of (user_id, tier) pairs applied; duplicate user_ids
180+
across tiers are counted per-apply since the later tier overwrites.
181+
"""
182+
count = 0
183+
for tier in tiers:
184+
if tier.max_band not in _VALID_TIER_BANDS:
185+
raise ValueError(
186+
f"UserBudgetTier.max_band must be one of PRODUCTION/INTERACTIVE/BATCH; "
187+
f"got {tier.max_band} for users {list(tier.user_ids)}"
188+
)
189+
for user_id in tier.user_ids:
190+
if not user_id:
191+
raise ValueError("UserBudgetTier.user_ids contains an empty entry")
192+
db.ensure_user(user_id, now)
193+
db.set_user_budget(
194+
user_id=user_id,
195+
budget_limit=tier.budget_limit,
196+
max_band=tier.max_band,
197+
now=now,
198+
)
199+
count += 1
200+
if count:
201+
logger.info("Reconciled %d user budget assignment(s) from cluster config", count)
202+
return count

lib/iris/src/iris/cluster/controller/main.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,15 @@
2323
import click
2424

2525
from iris.cluster.controller.auth import ControllerAuth, create_controller_auth
26+
from iris.cluster.controller.budget import reconcile_user_budget_tiers
2627
from iris.cluster.controller.controller import Controller, ControllerConfig
2728
from iris.cluster.providers.types import port_is_open, resolve_external_host
2829
from iris.log_server.main import (
2930
AUTH_STRICT_ENV_VAR as LOG_SERVER_AUTH_STRICT_ENV_VAR,
3031
JWT_KEY_ENV_VAR as LOG_SERVER_JWT_KEY_ENV_VAR,
3132
)
3233
from iris.rpc import config_pb2
33-
from rigging.timing import Duration, ExponentialBackoff
34+
from rigging.timing import Duration, ExponentialBackoff, Timestamp
3435

3536
logger = logging.getLogger(__name__)
3637

@@ -220,6 +221,14 @@ def run_controller_serve(
220221
if auth.worker_token and base_worker_config is not None:
221222
base_worker_config.auth_token = auth.worker_token
222223

224+
# Reconcile per-user budget tiers from the cluster config into the DB.
225+
# Runs after auth so users listed in admin_users already have rows; the
226+
# upsert here adds/updates budget_limit and max_band for everyone in
227+
# cluster_config.user_budgets. Unlisted users still fall through to
228+
# UserBudgetDefaults at first-submit time.
229+
if cluster_config and cluster_config.user_budgets:
230+
reconcile_user_budget_tiers(db, cluster_config.user_budgets, Timestamp.now())
231+
223232
# --- Start log server subprocess ---
224233
log_port = port + LOG_SERVER_PORT_OFFSET
225234
log_dir = local_state_dir / "logs"

lib/iris/src/iris/rpc/config.proto

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ edition = "2023";
1616

1717
package iris.config;
1818

19+
import "job.proto";
1920
import "time.proto";
2021

2122
// Enable explicit presence tracking for all fields to support HasField on scalars
@@ -445,6 +446,28 @@ message KubernetesProviderConfig {
445446
string controller_address = 8; // Controller URL injected into task pods
446447
}
447448

449+
// ============================================================================
450+
// USER BUDGETS
451+
// ============================================================================
452+
453+
// Pre-seeds a row in the user_budgets table for each listed user at controller
454+
// startup. This is the source of truth for tier membership — the controller
455+
// upserts these into the DB on every start, so a DB wipe or a listed user
456+
// who hasn't yet submitted a job still lands on the intended budget + max
457+
// band. Tiers are applied in order; later tiers override earlier tiers for
458+
// the same user_id.
//
// Entries are validated by reconcile_user_budget_tiers
// (iris/cluster/controller/budget.py): an empty user_id, or a max_band other
// than PRODUCTION / INTERACTIVE / BATCH, raises ValueError.
459+
message UserBudgetTier {
460+
// Users assigned this tier. A user_id must match the authenticated identity
461+
// the submitter presents (see AuthConfig); a mismatch silently drops the
462+
// user to the INSERT OR IGNORE default row created at first job submission.
// Empty strings are rejected by the controller.
463+
repeated string user_ids = 1;
464+
// Max resource-value spend before tasks are downgraded to BATCH. 0 = unlimited.
465+
int64 budget_limit = 2;
466+
// Highest priority band this user is allowed to submit to. Submissions above
467+
// this band are rejected with PERMISSION_DENIED.
// Must be set explicitly to PRIORITY_BAND_PRODUCTION, PRIORITY_BAND_INTERACTIVE,
// or PRIORITY_BAND_BATCH; an unset or UNSPECIFIED value is treated as a config
// error rather than as an implicit BATCH cap.
468+
iris.job.PriorityBand max_band = 3;
469+
}
470+
448471
// ============================================================================
449472
// ROOT CLUSTER CONFIGURATION
450473
// ============================================================================
@@ -473,4 +496,9 @@ message IrisClusterConfig {
473496
KubernetesProviderConfig kubernetes_provider = 70;
474497
WorkerProviderConfig worker_provider = 71;
475498
}
499+
500+
// Per-user budget/max-band policy reconciled into the DB at every controller
501+
// start. Unlisted users fall through to the default in UserBudgetDefaults
502+
// (set at first job submission via INSERT OR IGNORE).
503+
repeated UserBudgetTier user_budgets = 80;
476504
}

0 commit comments

Comments
 (0)