feat(prover): Autoscaler detects which pod is out of resources and reschedules it (#4008)

yorik · web-flow · commit 56658c89f83f · 2025-05-14T14:27:00.000Z
## What ❔

The Autoscaler now detects which pod is out of resources and reschedules it.
For one run, a pool is marked as full (`max_pool_size` = number of Running +
Pending pods) if it contains an out-of-resources pod.
Add a `priority` config override.
Remove the unneeded `need_to_move_duration` config parameter.

<!-- What are the changes this PR brings about? -->
<!-- Example: This PR adds a PR template to the repo. -->
<!-- (For bigger PRs adding more context is appreciated) -->

## Why ❔

Previously, all pending pods could be pushed to a different cluster even if
only one of them was out of resources.
This change also adds the possibility to quickly switch to a backup GPU if the main one is out.

<!-- Why are these changes done? What goal do they contribute to? What
are the principles behind them? -->
<!-- The `Why` has to be clear to non-Matter Labs entities running their
own ZK Chain -->
<!-- Example: PR templates ensure PR reviewers, observers, and future
iterators are in context about the evolution of repos. -->

## Is this a breaking change?
- [ ] Yes
- [x] No

## Operational changes
<!-- Any config changes? Any new flags? Any changes to any scripts? -->
<!-- Please add anything that non-Matter Labs entities running their own
ZK Chain may need to know -->

## Checklist

<!-- Check your PR fulfills the following items. -->
<!-- For draft PRs check the boxes as you complete them. -->

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Tests for the changes have been added / updated.
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted via `zkstack dev fmt` and `zkstack dev
lint`.

ref ZKD-2682
diff --git a/prover/crates/bin/prover_autoscaler/README.md b/prover/crates/bin/prover_autoscaler/README.md
@@ -170,6 +170,8 @@ agent_config:
   - `max_replicas` is a map of cluster name to maximum number of replicas. Note: it can be a number or a map of GPU types
     to a number.
   - `speed` is a divider for corresponding queue. Note: it can be a number or a map of GPU types to a number.
+  - `priority` is an optional field to override global cluster priorities for this target. For GPU targets it's a sorted
+    list of `[cluster, gpu]` pairs; for simple targets it's just a list of clusters.
 
 Example:
 
@@ -212,13 +214,21 @@ scaler_config:
       speed:
         L4: 500
         T4: 400
+      priority:
+        - [cluster1, H100]
+        - [cluster2, H100]
+        - [cluster1, L4]
+        - [cluster3, T4]
     - queue_report_field: basic_witness_jobs
       deployment: witness-generator-basic-fri
       min_replicas: 1
       max_replicas:
         cluster1: 10
         cluster2: 20
       speed: 4
+      priority:
+        - cluster2
+        - cluster1
     - queue_report_field: leaf_witness_jobs
       deployment: witness-generator-leaf-fri
       max_replicas:
diff --git a/prover/crates/bin/prover_autoscaler/src/cluster_types.rs b/prover/crates/bin/prover_autoscaler/src/cluster_types.rs
@@ -21,12 +21,14 @@ string_type!(ClusterName);
 string_type!(NamespaceName);
 string_type!(DeploymentName);
 
-#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Pod {
     pub owner: String,
     pub status: String,
     pub changed: DateTime<Utc>,
+    pub out_of_resources: bool,
 }
+
 #[derive(Debug, Clone, Default, Serialize, Deserialize)]
 pub struct Deployment {
     pub running: usize,
diff --git a/prover/crates/bin/prover_autoscaler/src/config.rs b/prover/crates/bin/prover_autoscaler/src/config.rs
@@ -79,12 +79,6 @@ pub struct ProverAutoscalerScalerConfig {
         default = "ProverAutoscalerScalerConfig::default_scale_errors_duration"
     )]
     pub scale_errors_duration: Duration,
-    /// Time window for which Autoscaler forces pending pod migration due to scale errors.
-    #[serde(
-        with = "humantime_serde",
-        default = "ProverAutoscalerScalerConfig::default_need_to_move_duration"
-    )]
-    pub need_to_move_duration: Duration,
     /// List of simple autoscaler targets.
     pub scaler_targets: Vec<ScalerTarget>,
     /// If dry-run enabled don't send any scale requests.
@@ -136,6 +130,13 @@ impl ScalarOrMap {
     }
 }
 
+#[derive(Debug, Clone, PartialEq, Deserialize)]
+#[serde(untagged)]
+pub enum PriorityConfig {
+    Gpu(Vec<(ClusterName, GpuKey)>),
+    Simple(Vec<ClusterName>),
+}
+
 #[derive(Debug, Default, Display, Clone, Copy, PartialEq, EnumString, Deserialize)]
 pub enum ScalerTargetType {
     #[default]
@@ -161,6 +162,11 @@ pub struct ScalerTarget {
     /// The queue will be divided by the speed and rounded up to get number of replicas.
     #[serde(default = "ScalerTarget::default_speed")]
     pub speed: ScalarOrMap,
+    /// Optional priority list that overrides global cluster_priorities.
+    /// For GPU targets, this is a list of (ClusterName, GpuKey) tuples.
+    /// For Simple targets, this is a list of ClusterName.
+    #[serde(default)]
+    pub priority: Option<PriorityConfig>,
 }
 
 impl ProverAutoscalerConfig {
@@ -204,11 +210,6 @@ impl ProverAutoscalerScalerConfig {
     pub fn default_scale_errors_duration() -> Duration {
         Duration::from_secs(3600)
     }
-
-    /// Default long_pending_duration -- 4m
-    pub fn default_need_to_move_duration() -> Duration {
-        Duration::from_secs(4 * 60)
-    }
 }
 
 impl ScalerTarget {
diff --git a/prover/crates/bin/prover_autoscaler/src/global/manager.rs b/prover/crates/bin/prover_autoscaler/src/global/manager.rs
@@ -51,9 +51,6 @@ impl Manager {
             scale_errors_duration: chrono::Duration::seconds(
                 config.scale_errors_duration.as_secs() as i64,
             ),
-            need_to_move_duration: chrono::Duration::seconds(
-                config.need_to_move_duration.as_secs() as i64,
-            ),
         });
 
         for c in &config.scaler_targets {
@@ -69,6 +66,7 @@ impl Manager {
                         .collect(),
                     c.speed.into_map_gpukey(),
                     scaler_config.clone(),
+                    c.priority.clone(),
                 ))),
                 ScalerTargetType::Simple => scalers.push(Box::new(Scaler::<NoKey>::new(
                     c.queue_report_field,
@@ -80,6 +78,7 @@ impl Manager {
                         .collect(),
                     c.speed.into_map_nokey(),
                     scaler_config.clone(),
+                    c.priority.clone(),
                 ))),
             };
         }
diff --git a/prover/crates/bin/prover_autoscaler/src/global/scaler.rs b/prover/crates/bin/prover_autoscaler/src/global/scaler.rs
diff --git a/prover/crates/bin/prover_autoscaler/src/k8s/watcher.rs b/prover/crates/bin/prover_autoscaler/src/k8s/watcher.rs