Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 135 additions & 7 deletions lib/iris/dashboard/src/components/controller/AutoscalerTab.vue
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ onMounted(refresh)

const expandedDemand = ref<Set<string>>(new Set())
const expandedSlices = ref<Set<string>>(new Set())
// Pool names whose rows are currently collapsed in the table.
const collapsedPools = ref<Set<string>>(new Set())

/**
 * Toggle the collapsed/expanded state of a pool section.
 *
 * A fresh Set is assigned (rather than mutating in place) so the ref
 * assignment reliably triggers Vue reactivity, matching the pattern used
 * by toggleDemand/expandedSlices elsewhere in this component.
 */
function togglePool(pool: string) {
  const next = new Set(collapsedPools.value)
  if (next.has(pool)) {
    next.delete(pool)
  } else {
    next.add(pool)
  }
  collapsedPools.value = next
}

function toggleDemand(name: string) {
const next = new Set(expandedDemand.value)
Expand Down Expand Up @@ -106,12 +113,20 @@ interface AvailabilityBadge {
classes: string
}

function groupAvailabilityBadge(group: ScaleGroupStatus): AvailabilityBadge | null {
function groupAvailabilityBadge(group: ScaleGroupStatus, section?: PoolSection): AvailabilityBadge | null {
const status = group.availabilityStatus
const blockedMs = timestampMs(group.blockedUntil)
const cooldownMs = timestampMs(group.scaleUpCooldownUntil)
const now = Date.now()

// Check for tier-blocked state (pool monotonicity)
if (section && section.blockedAtTier) {
const tier = group.config?.allocationTier ?? 0
if (tier > section.blockedAtTier) {
return { label: 'tier-blocked', classes: 'bg-status-danger-bg text-status-danger border-status-danger-border opacity-60' }
}
}

if (status === 'requesting') {
return { label: 'in-flight', classes: 'bg-status-purple-bg text-status-purple border-status-purple-border' }
}
Expand Down Expand Up @@ -221,6 +236,74 @@ const sortedGroupStatuses = computed<GroupRoutingStatus[]>(() => {
})
})

// Pool grouping for tier chain display.
// A section is either a named quota pool (its groups sorted by allocation
// tier) or the trailing unnamed section (pool === '') holding all groups
// that have no quotaPool configured.
interface PoolSection {
  // quotaPool name; empty string marks the section for unpooled groups
  pool: string
  // routing statuses belonging to this section (tier-sorted when pooled)
  groups: GroupRoutingStatus[]
  blockedAtTier: number | null // lowest tier in quota_exceeded/backoff, or null
}

const poolSections = computed<PoolSection[]>(() => {
const poolMap = new Map<string, GroupRoutingStatus[]>()
const unpooled: GroupRoutingStatus[] = []

for (const gs of sortedGroupStatuses.value) {
const group = groupIndex.value[gs.group]
const pool = group?.config?.quotaPool
if (pool) {
if (!poolMap.has(pool)) poolMap.set(pool, [])
poolMap.get(pool)!.push(gs)
} else {
unpooled.push(gs)
}
}

const sections: PoolSection[] = []
for (const [pool, poolGroups] of poolMap) {
// Sort by allocation_tier within the pool
poolGroups.sort((a, b) => {
const ta = groupIndex.value[a.group]?.config?.allocationTier ?? 0
const tb = groupIndex.value[b.group]?.config?.allocationTier ?? 0
return ta - tb
})

// Find the lowest blocked tier
let blockedAtTier: number | null = null
for (const gs of poolGroups) {
const group = groupIndex.value[gs.group]
if (!group) continue
const tier = group.config?.allocationTier ?? 0
const status = group.availabilityStatus
if (tier > 0 && (status === 'quota_exceeded' || status === 'backoff')) {
if (blockedAtTier === null || tier < blockedAtTier) {
blockedAtTier = tier
}
}
}

sections.push({ pool, groups: poolGroups, blockedAtTier })
}

if (unpooled.length > 0) {
sections.push({ pool: '', groups: unpooled, blockedAtTier: null })
Comment thread
rjpower marked this conversation as resolved.
}

return sections
})

/**
 * True when this group's allocation tier sits strictly above the pool's
 * lowest blocked tier — i.e. a lower tier in the same quota pool is in
 * quota_exceeded/backoff, so the autoscaler will skip this group.
 *
 * Uses an explicit null check instead of truthiness: `!blockedAtTier`
 * would also be true for a blocked tier of 0, silently disabling the
 * blocked styling in that case.
 */
function isTierBlocked(gs: GroupRoutingStatus, section: PoolSection): boolean {
  if (section.blockedAtTier === null) return false
  const tier = groupIndex.value[gs.group]?.config?.allocationTier ?? 0
  return tier > section.blockedAtTier
}

/** Short tier badge text ("T1", "T2", …); empty string for untiered groups. */
function tierLabel(gs: GroupRoutingStatus): string {
  const entry = groupIndex.value[gs.group]
  const tier = entry?.config?.allocationTier ?? 0
  if (tier <= 0) return ''
  return `T${tier}`
}

function isInactiveRow(gs: GroupRoutingStatus): boolean {
const group = groupIndex.value[gs.group]
const counts = group?.sliceStateCounts ?? {}
Expand Down Expand Up @@ -463,12 +546,56 @@ function idleThresholdMs(groupName: string): number {
</tr>
</thead>
<tbody>
<template v-for="gs in sortedGroupStatuses" :key="gs.group">
<template v-for="section in poolSections" :key="section.pool || '__unpooled'">
<!-- Pool header row -->
<tr v-if="section.pool" class="bg-surface border-b border-surface-border cursor-pointer hover:bg-surface-raised" @click="togglePool(section.pool)">
<td colspan="8" class="px-3 py-1.5">
<div class="flex items-center gap-2">
<span class="text-[10px] text-text-muted">
{{ collapsedPools.has(section.pool) ? '▶' : '▼' }}
</span>
<span class="text-xs font-semibold uppercase tracking-wider text-text-secondary">
Pool: {{ section.pool }}
</span>
<span
v-if="section.blockedAtTier"
class="inline-flex items-center px-1.5 py-0.5 rounded text-xs border
bg-status-danger-bg text-status-danger border-status-danger-border"
>
blocked at tier {{ section.blockedAtTier }}+
</span>
<!-- Tier chain visualization -->
<span class="flex items-center gap-0.5 text-xs text-text-muted ml-2">
<template v-for="(gs, idx) in section.groups" :key="gs.group">
<span v-if="idx > 0" class="text-text-muted mx-0.5">&rarr;</span>
<span
:class="[
'px-1 py-0.5 rounded border text-[11px] font-mono',
isTierBlocked(gs, section)
? 'bg-status-danger-bg text-status-danger border-status-danger-border line-through'
: groupIndex[gs.group]?.availabilityStatus === 'quota_exceeded'
? 'bg-status-danger-bg text-status-danger border-status-danger-border'
: groupIndex[gs.group]?.availabilityStatus === 'backoff'
? 'bg-status-orange-bg text-status-orange border-status-orange-border'
: 'bg-surface border-surface-border text-text-secondary',
]"
>
{{ tierLabel(gs) }}
</span>
</template>
</span>
</div>
</td>
</tr>

<template v-for="gs in section.groups" :key="gs.group">
<!-- Main row -->
<tr
v-if="!section.pool || !collapsedPools.has(section.pool)"
:class="[
'border-b border-surface-border-subtle hover:bg-surface-raised transition-colors',
isInactiveRow(gs) ? 'opacity-50' : '',
isTierBlocked(gs, section) ? 'opacity-40' : '',
]"
>
<!-- Priority -->
Expand All @@ -488,14 +615,14 @@ function idleThresholdMs(groupName: string): number {
&#x26a0; {{ groupFailures(gs.group) }} fail{{ groupFailures(gs.group) > 1 ? 's' : '' }}
</span>
</div>
<div v-if="groupIndex[gs.group] && groupAvailabilityBadge(groupIndex[gs.group])" class="mt-0.5">
<div v-if="groupIndex[gs.group] && groupAvailabilityBadge(groupIndex[gs.group], section)" class="mt-0.5">
<span
:class="[
'inline-flex items-center px-1.5 py-0.5 rounded text-xs border',
groupAvailabilityBadge(groupIndex[gs.group])!.classes,
groupAvailabilityBadge(groupIndex[gs.group], section)!.classes,
]"
>
{{ groupAvailabilityBadge(groupIndex[gs.group])!.label }}
{{ groupAvailabilityBadge(groupIndex[gs.group], section)!.label }}
</span>
</div>
</td>
Expand Down Expand Up @@ -576,7 +703,7 @@ function idleThresholdMs(groupName: string): number {
</tr>

<!-- Slice detail (expanded) -->
<tr v-if="expandedSlices.has(gs.group) && groupHasSlices(gs.group)" class="bg-surface-sunken">
<tr v-if="expandedSlices.has(gs.group) && groupHasSlices(gs.group) && (!section.pool || !collapsedPools.has(section.pool))" class="bg-surface-sunken">
<td colspan="8" class="px-6 py-3">
<div class="space-y-1.5">
<div
Expand Down Expand Up @@ -615,7 +742,7 @@ function idleThresholdMs(groupName: string): number {
</tr>

<!-- Demand detail (expanded) -->
<tr v-if="expandedDemand.has(gs.group) && groupDemand(gs.group) > 0" class="bg-surface-sunken">
<tr v-if="expandedDemand.has(gs.group) && groupDemand(gs.group) > 0 && (!section.pool || !collapsedPools.has(section.pool))" class="bg-surface-sunken">
<td colspan="8" class="px-6 py-3">
<div class="space-y-1">
<div
Expand All @@ -634,6 +761,7 @@ function idleThresholdMs(groupName: string): number {
</td>
</tr>
</template>
</template>
</tbody>
</table>
</div>
Expand Down
7 changes: 6 additions & 1 deletion lib/iris/dashboard/src/types/rpc.ts
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,14 @@ export interface SliceInfo {
idle?: boolean
}

/**
 * Subset of the autoscaler ScaleGroupConfig proto surfaced to the dashboard.
 * Groups sharing a quotaPool propagate quota-exceeded state together; the
 * AutoscalerTab uses these fields to render the per-pool tier chain.
 */
export interface ScaleGroupConfig {
  // Pool identifier shared by groups whose quota state is linked.
  quotaPool?: string
  // 1-based position in the pool's size ladder; 0/absent means untiered.
  allocationTier?: number
}

export interface ScaleGroupStatus {
name: string
config?: Record<string, unknown>
config?: ScaleGroupConfig
currentDemand?: number
peakDemand?: number
backoffUntil?: ProtoTimestamp
Expand Down
140 changes: 140 additions & 0 deletions lib/iris/docs/tpu-pool-expansion.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# TPU Pool Expansion & Allocation Tiers

**Status:** implementation plan
**Last updated:** 2026-04-03

## Problem

When the autoscaler can't allocate a TPU of size X in a zone, it falls through the priority waterfall and tries size 2X, 4X, etc. This is wasteful and dangerous:

1. **Wasted API calls**: GCP TPU capacity is monotonic — if v5p-8 is unavailable, v5p-16 will also be unavailable. Each failed attempt burns rate limit tokens and adds latency.
2. **Accidental over-allocation**: If a larger slice transiently succeeds, the job gets more resources than intended and is more likely to be preempted.
3. **Config verbosity**: Each TPU size × zone is a separate scale group entry. The production config has ~35 nearly-identical entries that differ only in size-derived fields.

## Design

Two changes:

### 1. TPU Pool Config Sugar (`tpu_pools`)

A new top-level YAML key that expands into scale groups. Each pool defines shared properties for a TPU family; the `sizes` map lists per-size overrides.

```yaml
tpu_pools:
v5e-preemptible:
family: v5e
zones: [europe-west4-b, us-west4-a]
base_priority: 10
resources: { cpu: 112, ram: 192GB, disk: 100GB, preemptible: true }
slice_template:
gcp:
service_account: iris-worker@hai-gcp-models.iam.gserviceaccount.com
runtime_version: v2-alpha-tpuv5-lite
sizes:
4: { min_slices: 3, max_slices: 1024 }
8: { max_slices: 512 }
16: { max_slices: 256 }
```

The pool name (`v5e-preemptible`) is an operator-chosen label, independent of the TPU family. This allows multiple pools for the same family (e.g., `v5e-preemptible` vs `v5e-reserved` with different zones, priorities, and preemptibility).

**Expansion** (`_expand_tpu_pools`): For each pool × size × zone, emit a scale group:

- **name**: `tpu_{pool}_{size}-{zone}` (e.g., `tpu_v5e-preemptible_16-europe-west4-b`)
- **device_variant**: looked up from `TpuTopologyInfo` via `family` (e.g., `v5e` → `v5litepod-16`)
- **num_vms**: `TpuTopologyInfo.vm_count`
- **device_count**: `TpuTopologyInfo.chips_per_vm`
- **device_type**: `tpu` (injected)
- **priority**: `base_priority + (tier_index × 10)` where tier_index is the 0-based position in sorted sizes
- **quota_pool**: `{pool_name}/{zone}` (e.g., `v5e-preemptible/europe-west4-b`). Per-zone because GCP quota is per-zone — a failure in one zone should not block allocation in another.
- **allocation_tier**: `tier_index + 1` (1-based)
- **min_slices**: from size entry, default 0
- **max_slices**: from size entry (required)
- **zone, region**: set on `slice_template.gcp.zone` and `worker.attributes`

The function runs before `_expand_multi_zone_groups` and handles zone expansion itself (TPU pools don't go through the generic zone expander).

**Family → variant mapping**: A dict in `types.py`:

```python
TPU_FAMILY_VARIANT_PREFIX: dict[str, str] = {
"v4": "v4",
"v5e": "v5litepod",
"v5p": "v5p",
"v6e": "v6e",
}
```

The variant name for a pool with `family: v5e` and size `16` is `v5litepod-16`. This is validated against `TPU_TOPOLOGIES` — unknown family/size combinations are rejected at config load time.

### 2. Allocation Tiers (`quota_pool` + `allocation_tier`)

Two new fields on `ScaleGroupConfig`:

```protobuf
message ScaleGroupConfig {
// Groups sharing a quota_pool propagate quota-exceeded state together.
// When tier N in a pool hits quota, tiers >= N are filtered out
// (including the blocked group itself).
string quota_pool = 80;
int32 allocation_tier = 81;
}
```

**Autoscaler behavior** change in `route_demand`:

When filtering matching groups for a demand entry, skip every group whose tier is **at or above the lowest blocked tier in its quota_pool** — the lowest tier currently in `QUOTA_EXCEEDED` or `BACKOFF` state (note this filters out the blocked group itself as well as all higher tiers). This is a filter applied after hard constraint matching and before budget assignment.

```python
def _pool_blocked_tiers(groups: list[ScalingGroup], ts: Timestamp) -> dict[str, int]:
"""Return the minimum blocked tier per quota_pool.

If pool "v5e" has tier 1 in QUOTA_EXCEEDED, returns {"v5e": 1},
meaning tiers >= 1 should be skipped.
"""
blocked: dict[str, int] = {}
for g in groups:
pool = g.config.quota_pool
tier = g.config.allocation_tier
if not pool or not tier:
continue
avail = g.availability(ts)
if avail.status in (GroupAvailability.QUOTA_EXCEEDED, GroupAvailability.BACKOFF):
if pool not in blocked or tier < blocked[pool]:
blocked[pool] = tier
return blocked
```

In `route_demand`, after `matching_groups` is computed:

```python
blocked = _pool_blocked_tiers(sorted_groups, ts)
matching_groups = [
g for g in matching_groups
if not g.config.quota_pool
or g.config.allocation_tier < blocked.get(g.config.quota_pool, float('inf'))
]
```

**Dashboard**: The AutoscalerTab groups scale groups by `quota_pool` when present, showing a visual tier chain: `[v5p-8 ✓] → [v5p-16 ⊘] → [v5p-32 ⊘]`.

## Implementation Plan

### Stage 1: Proto + config expansion

1. Add `TPU_FAMILY_VARIANT_PREFIX` dict to `types.py`
2. Add `quota_pool` and `allocation_tier` fields to `ScaleGroupConfig` in `config.proto`, regenerate
3. Implement `_expand_tpu_pools()` in `config.py`
4. Wire into `load_config()` before `_expand_multi_zone_groups()`
5. Tests: expansion correctness, topology derivation, validation errors
6. Migrate `examples/marin.yaml` to `tpu_pools` format

### Stage 2: Autoscaler tier blocking

7. Implement `_pool_blocked_tiers()` in `autoscaler.py`
8. Add tier filtering to `route_demand()`
9. Tests: tier blocking on quota exceeded, independent pools, groups without pools

### Stage 3: Dashboard

10. Group autoscaler view by `quota_pool`
11. Show tier chain with blocked/available visual state
Loading