BauplanLabs
diff --git a/‎eudoxia/__main__.py‎
Lines changed: 41 additions & 7 deletions b/‎eudoxia/__main__.py‎
Lines changed: 41 additions & 7 deletions
diff --git a/‎eudoxia/executor/executor.py‎
Lines changed: 4 additions & 3 deletions b/‎eudoxia/executor/executor.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎eudoxia/executor/resource_pool.py‎
Lines changed: 15 additions & 1 deletion b/‎eudoxia/executor/resource_pool.py‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎eudoxia/scheduler/naive.py‎
Lines changed: 4 additions & 3 deletions b/‎eudoxia/scheduler/naive.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎eudoxia/scheduler/overbook.py‎
Lines changed: 2 additions & 0 deletions b/‎eudoxia/scheduler/overbook.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎eudoxia/scheduler/priority.py‎
Lines changed: 5 additions & 1 deletion b/‎eudoxia/scheduler/priority.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎eudoxia/scheduler/priority_pool.py‎
Lines changed: 6 additions & 2 deletions b/‎eudoxia/scheduler/priority_pool.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎eudoxia/scheduler/rest.py‎
Lines changed: 1 addition & 1 deletion b/‎eudoxia/scheduler/rest.py‎
Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,7 @@
 from io import StringIO
 from pathlib import Path
 from eudoxia.simulator import run_simulator, parse_args_with_defaults, get_param_defaults
+from eudoxia.utils import Priority
 from eudoxia.scheduler.decorators import SCHEDULING_ALGOS
 from eudoxia.workload.csv_io import CSVWorkloadReader, CSVWorkloadWriter, WorkloadTraceGenerator
 from eudoxia.workload import WorkloadGenerator
@@ -53,25 +54,58 @@ def run_command(params_file, workload=None):
     print(f"  Assignments: {stats.assignments}")
     print(f"  Suspensions: {stats.suspensions}")
     print(f"  Failures: {stats.failures}")
-    print(f"  Failure/error counts: {stats.failure_error_counts}")
+    print(f"  Container failure counts: {stats.failure_error_counts}")
     print(f"  Mean memory allocated: {stats.mean_memory_allocated_percent:.1f}%")
     print(f"  Mean memory consumed: {stats.mean_memory_consumed_percent:.1f}%")
     print()
     print("  Pipeline Stats:")
-    print("  " + "-" * 68)
-    print(f"  {'Priority':<15} {'Arrived':>10} {'Completed':>10} {'Mean (s)':>12} {'P99 (s)':>12}")
-    print("  " + "-" * 68)
+    print("  " + "-" * 80)
+    print(f"  {'Priority':<15} {'Arrived':>10} {'Completed':>10} {'Timed Out':>10} {'Mean (s)':>12} {'P99 (s)':>12}")
+    print("  " + "-" * 80)
     pipeline_stats = [
         ("All", stats.pipelines_all),
         ("Query", stats.pipelines_query),
         ("Interactive", stats.pipelines_interactive),
         ("Batch", stats.pipelines_batch),
     ]
     for name, pstats in pipeline_stats:
-        print(f"  {name:<15} {pstats.arrival_count:>10} {pstats.completion_count:>10} {pstats.mean_latency_seconds:>12.2f} {pstats.p99_latency_seconds:>12.2f}")
-    print("  " + "-" * 68)
+        print(f"  {name:<15} {pstats.arrival_count:>10} {pstats.completion_count:>10} {pstats.timeout_count:>10} {pstats.mean_latency_seconds:>12.2f} {pstats.p99_latency_seconds:>12.2f}")
+    print("  " + "-" * 80)
     print()
-    print(f"  Adjusted latency: {stats.adjusted_latency():.2f}s")
+
+    # print adjusted latency, which puts more weight on high-priority
+    # jobs (like query).
+    #
+    # it also penalizes the metric for unfinished work; the approach
+    # depends on whether a max job time is configured.  When
+    # max_job_seconds is positive, each unfinished pipeline is assigned
+    # a penalty latency of 2x the max job time.  Otherwise, the weighted
+    # mean latency is divided by the completion rate (so finishing half
+    # the work doubles the metric).
+    weights = {
+        Priority.QUERY: 10,
+        Priority.INTERACTIVE: 5,
+        Priority.BATCH_PIPELINE: 1,
+    }
+    max_job_seconds = params["max_job_seconds"]
+    if max_job_seconds > 0:
+        penalty = 2 * max_job_seconds
+        adjusted = stats.adjusted_latency(
+            weights=weights,
+            divide_by_completion_rate=False,
+            unfinished_penalty_seconds=penalty,
+        )
+        print(f"  Adjusted latency: {adjusted:.2f}s")
+        print(f"    (weights: query={weights[Priority.QUERY]}, interactive={weights[Priority.INTERACTIVE]}, batch={weights[Priority.BATCH_PIPELINE]}; "
+              f"unfinished penalty: {penalty}s = 2 * max_job_seconds)")
+    else:
+        adjusted = stats.adjusted_latency(
+            weights=weights,
+            divide_by_completion_rate=True,
+        )
+        print(f"  Adjusted latency: {adjusted:.2f}s")
+        print(f"    (weights: query={weights[Priority.QUERY]}, interactive={weights[Priority.INTERACTIVE]}, batch={weights[Priority.BATCH_PIPELINE]}; "
+              f"divided by completion rate)")
 
 
 def gentrace_command(params_file, output_file, force=False):
 
@@ -11,12 +11,13 @@ class Executor:
     Manager of a pool of resources and active containers, the Executor takes
     assignments and ensures that all costs and resources are accounted for and
     additional are allocated if instructed.
-    
+
     Acts like a cluster manager that keeps track of utilization of machines
     (that is, resource pools).
     """
-    def __init__(self, num_pools, cpus_per_pool, ram_gb_per_pool, ticks_per_second,
+    def __init__(self, clock, num_pools, cpus_per_pool, ram_gb_per_pool, ticks_per_second,
                  allow_memory_overcommit=False, **kwargs):
+        self.clock = clock
         self.num_pools = num_pools
         self.cpus_per_pool = cpus_per_pool
         self.ram_gb_per_pool = ram_gb_per_pool
@@ -26,7 +27,7 @@ def __init__(self, num_pools, cpus_per_pool, ram_gb_per_pool, ticks_per_second,
         # Initialize pools with identical resources
         self.pools: List[ResourcePool] = []
         for i in range(self.num_pools):
-            new_pool = ResourcePool(pool_id=i, cpu_pool=cpus_per_pool, ram_pool=ram_gb_per_pool,
+            new_pool = ResourcePool(clock, pool_id=i, cpu_pool=cpus_per_pool, ram_pool=ram_gb_per_pool,
                                    ticks_per_second=self.ticks_per_second,
                                    allow_memory_overcommit=allow_memory_overcommit, **kwargs)
             self.pools.append(new_pool)
 
@@ -14,9 +14,10 @@ class ResourcePool:
 
     A resource pool is analogous to a machine on which we can run containers.
     """
-    def __init__(self, pool_id, cpu_pool, ram_pool, ticks_per_second,
+    def __init__(self, clock, pool_id, cpu_pool, ram_pool, ticks_per_second,
                  multi_operator_containers=True, allow_memory_overcommit=False, **kwargs):
         # CONFIGURATION
+        self.clock = clock
         self.pool_id = pool_id
         self.ticks_per_second = ticks_per_second
         self.tick_length_secs = 1.0 / ticks_per_second
@@ -141,6 +142,16 @@ def _run_out_of_memory_killer(self):
                 break
             victim.kill("OOM")
 
+    def _run_out_of_time_killer(self):
+        """Kill each uncompleted active container that has an op from a timed-out pipeline."""
+        for c in self.active_containers:
+            if c.is_completed():
+                continue
+            for op in c.operators:
+                if op.pipeline.runtime_status().has_timed_out():
+                    c.kill("timeout")
+                    break
+
     def run_one_tick(self, suspensions: List[Suspend],
                      assignments: List[Assignment]) -> List[ExecutionResult]:
         """
@@ -194,6 +205,9 @@ def run_one_tick(self, suspensions: List[Suspend],
         # Kill as necessary to keep within total and individual limits
         self._run_out_of_memory_killer()
 
+        # Kill containers with ops belonging to timed-out pipelines
+        self._run_out_of_time_killer()
+
         # Process completed containers (including those killed by pool-level OOM)
         to_remove = []
         for c in self.active_containers:
 
@@ -32,7 +32,7 @@ def naive_pipeline(s, results: List[ExecutionResult],
     # this case
     if len(pipelines) == 0 and len(results) == 0:
         return [], []
-    
+
     for p in pipelines:
         s.waiting_queue.append(p)
 
@@ -54,8 +54,9 @@ def naive_pipeline(s, results: List[ExecutionResult],
         # find a pipeline with ops we can assign
         while s.waiting_queue:
             pipeline = s.waiting_queue.pop(0)
-            has_failures = pipeline.runtime_status().state_counts[OperatorState.FAILED] > 0
-            if pipeline.runtime_status().is_pipeline_successful() or has_failures:
+            status = pipeline.runtime_status()
+            has_failures = status.state_counts[OperatorState.FAILED] > 0
+            if status.is_pipeline_successful() or has_failures or status.has_timed_out():
                 # we don't retry, so anything complete or with failures
                 # will be permanently removed from the queue
                 continue
 
@@ -100,6 +100,8 @@ def make_assignments(s) -> List[Assignment]:
     for op_idx, op in enumerate(s.op_queue):
         if s.pipeline_failures[op.pipeline.pipeline_id] >= MAX_FAILURES:
             continue
+        if op.pipeline.runtime_status().has_timed_out():
+            continue
         assert op.state() in ASSIGNABLE_STATES, f"op {op.id} of pipeline {op.pipeline.pipeline_id} was in queue, but has non-assignable state, {op.state()}"
 
         assignment = try_make_assignment(s, op)
 
@@ -162,8 +162,12 @@ def priority_scheduler(s, results: List[ExecutionResult],
         to_remove = []
         to_start = []
         for job in queue:
+            if job.pipeline.runtime_status().has_timed_out():
+                to_remove.append(job)
+                continue
+
             # checking which pool has the most available ram. MUST USE
-            # COPIED STATS as scheduler is placing assignments into pools  
+            # COPIED STATS as scheduler is placing assignments into pools
             # TODO: PARAMATERIZE THIS
             pool_id = get_pool_with_max_avail_ram(s, pool_stats)
             # all pools depleted
 
@@ -135,8 +135,12 @@ def priority_pool_scheduler(s, results: List[ExecutionResult],
             to_remove = []
             to_start = []
             for job in queue:
-                avail_ram = pool_stats[pool_id]["avail_ram"] 
-                avail_cpu = pool_stats[pool_id]["avail_cpu"] 
+                if job.pipeline.runtime_status().has_timed_out():
+                    to_remove.append(job)
+                    continue
+
+                avail_ram = pool_stats[pool_id]["avail_ram"]
+                avail_cpu = pool_stats[pool_id]["avail_cpu"]
 
                 # the pool is depleted so we shouldn't make any allocations
                 if avail_ram == 0 or avail_cpu == 0:
 
@@ -130,7 +130,7 @@ def rest_scheduler(s, results: List[ExecutionResult],
         s.other_pipelines[p.pipeline_id] = p
     for pipeline_id in list(s.other_pipelines.keys()):
         pipeline = s.other_pipelines[pipeline_id]
-        if pipeline.runtime_status().is_pipeline_successful():
+        if pipeline.runtime_status().is_pipeline_successful() or pipeline.runtime_status().has_timed_out():
             for op in pipeline.values:
                 del s.operator_lookup[str(op.id)]
             del s.other_pipelines[pipeline_id]