Skip to content

Commit 56658c8

Browse files
authored
feat(prover): Autoscaler detect which pod out of resources and reschedule (#4008)
## What ❔ The autoscaler detects which pod is out of resources and reschedules it. For one run, a pool is marked as full (`max_pool_size` = number of Running + Pending pods) if there is an out-of-resources pod in it. Add a `priority` config override. Remove the unneeded `need_to_move_duration` config parameter. <!-- What are the changes this PR brings about? --> <!-- Example: This PR adds a PR template to the repo. --> <!-- (For bigger PRs adding more context is appreciated) --> ## Why ❔ Previously, all pending pods could be pushed to a different cluster even if only one of them was out of resources. This change also adds the possibility to quickly switch to a backup GPU if the main one is out. <!-- Why are these changes done? What goal do they contribute to? What are the principles behind them? --> <!-- The `Why` has to be clear to non-Matter Labs entities running their own ZK Chain --> <!-- Example: PR templates ensure PR reviewers, observers, and future iterators are in context about the evolution of repos. --> ## Is this a breaking change? - [ ] Yes - [x] No ## Operational changes <!-- Any config changes? Any new flags? Any changes to any scripts? --> <!-- Please add anything that non-Matter Labs entities running their own ZK Chain may need to know --> ## Checklist <!-- Check your PR fulfills the following items. --> <!-- For draft PRs check the boxes as you complete them. --> - [x] PR title corresponds to the body of PR (we generate changelog entries from PRs). - [x] Tests for the changes have been added / updated. - [x] Documentation comments have been added / updated. - [x] Code has been formatted via `zkstack dev fmt` and `zkstack dev lint`. ref ZKD-2682
1 parent 91772a4 commit 56658c8

File tree

6 files changed

+383
-66
lines changed

6 files changed

+383
-66
lines changed

prover/crates/bin/prover_autoscaler/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,8 @@ agent_config:
170170
- `max_replicas` is a map of cluster name to maximum number of replicas. Note: it can be a number or a map of GPU types
171171
to a number.
172172
- `speed` is a divider for the corresponding queue. Note: it can be a number or a map of GPU types to a number.
173+
- `priority` is an optional field to override global cluster priorities for this target. For GPU targets it's a sorted
174+
list of `[cluster, gpu]` pairs; for simple targets it's just a list of clusters.
173175

174176
Example:
175177

@@ -212,13 +214,21 @@ scaler_config:
212214
speed:
213215
L4: 500
214216
T4: 400
217+
priority:
218+
- [cluster1, H100]
219+
- [cluster2, H100]
220+
- [cluster1, L4]
221+
- [cluster3, T4]
215222
- queue_report_field: basic_witness_jobs
216223
deployment: witness-generator-basic-fri
217224
min_replicas: 1
218225
max_replicas:
219226
cluster1: 10
220227
cluster2: 20
221228
speed: 4
229+
priority:
230+
- cluster2
231+
- cluster1
222232
- queue_report_field: leaf_witness_jobs
223233
deployment: witness-generator-leaf-fri
224234
max_replicas:

prover/crates/bin/prover_autoscaler/src/cluster_types.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@ string_type!(ClusterName);
2121
string_type!(NamespaceName);
2222
string_type!(DeploymentName);
2323

24-
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
24+
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
2525
pub struct Pod {
2626
pub owner: String,
2727
pub status: String,
2828
pub changed: DateTime<Utc>,
29+
pub out_of_resources: bool,
2930
}
31+
3032
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
3133
pub struct Deployment {
3234
pub running: usize,

prover/crates/bin/prover_autoscaler/src/config.rs

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,6 @@ pub struct ProverAutoscalerScalerConfig {
7979
default = "ProverAutoscalerScalerConfig::default_scale_errors_duration"
8080
)]
8181
pub scale_errors_duration: Duration,
82-
/// Time window for which Autoscaler forces pending pod migration due to scale errors.
83-
#[serde(
84-
with = "humantime_serde",
85-
default = "ProverAutoscalerScalerConfig::default_need_to_move_duration"
86-
)]
87-
pub need_to_move_duration: Duration,
8882
/// List of simple autoscaler targets.
8983
pub scaler_targets: Vec<ScalerTarget>,
9084
/// If dry-run enabled don't send any scale requests.
@@ -136,6 +130,13 @@ impl ScalarOrMap {
136130
}
137131
}
138132

133+
#[derive(Debug, Clone, PartialEq, Deserialize)]
134+
#[serde(untagged)]
135+
pub enum PriorityConfig {
136+
Gpu(Vec<(ClusterName, GpuKey)>),
137+
Simple(Vec<ClusterName>),
138+
}
139+
139140
#[derive(Debug, Default, Display, Clone, Copy, PartialEq, EnumString, Deserialize)]
140141
pub enum ScalerTargetType {
141142
#[default]
@@ -161,6 +162,11 @@ pub struct ScalerTarget {
161162
/// The queue will be divided by the speed and rounded up to get number of replicas.
162163
#[serde(default = "ScalerTarget::default_speed")]
163164
pub speed: ScalarOrMap,
165+
/// Optional priority list that overrides global cluster_priorities.
166+
/// For GPU targets, this is a list of (ClusterName, GpuKey) tuples.
167+
/// For Simple targets, this is a list of ClusterName.
168+
#[serde(default)]
169+
pub priority: Option<PriorityConfig>,
164170
}
165171

166172
impl ProverAutoscalerConfig {
@@ -204,11 +210,6 @@ impl ProverAutoscalerScalerConfig {
204210
pub fn default_scale_errors_duration() -> Duration {
205211
Duration::from_secs(3600)
206212
}
207-
208-
/// Default long_pending_duration -- 4m
209-
pub fn default_need_to_move_duration() -> Duration {
210-
Duration::from_secs(4 * 60)
211-
}
212213
}
213214

214215
impl ScalerTarget {

prover/crates/bin/prover_autoscaler/src/global/manager.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,6 @@ impl Manager {
5151
scale_errors_duration: chrono::Duration::seconds(
5252
config.scale_errors_duration.as_secs() as i64,
5353
),
54-
need_to_move_duration: chrono::Duration::seconds(
55-
config.need_to_move_duration.as_secs() as i64,
56-
),
5754
});
5855

5956
for c in &config.scaler_targets {
@@ -69,6 +66,7 @@ impl Manager {
6966
.collect(),
7067
c.speed.into_map_gpukey(),
7168
scaler_config.clone(),
69+
c.priority.clone(),
7270
))),
7371
ScalerTargetType::Simple => scalers.push(Box::new(Scaler::<NoKey>::new(
7472
c.queue_report_field,
@@ -80,6 +78,7 @@ impl Manager {
8078
.collect(),
8179
c.speed.into_map_nokey(),
8280
scaler_config.clone(),
81+
c.priority.clone(),
8382
))),
8483
};
8584
}

0 commit comments

Comments
 (0)