implement max job time

tylerharter · tylerharter · commit 977f9c97aff0 · 2026-04-09T22:10:06.000-05:00
diff --git a/eudoxia/__main__.py b/eudoxia/__main__.py
@@ -53,7 +53,7 @@ def run_command(params_file, workload=None):
     print(f"  Assignments: {stats.assignments}")
     print(f"  Suspensions: {stats.suspensions}")
     print(f"  Failures: {stats.failures}")
-    print(f"  Failure/error counts: {stats.failure_error_counts}")
+    print(f"  Container failure counts: {stats.failure_error_counts}")
     print(f"  Mean memory allocated: {stats.mean_memory_allocated_percent:.1f}%")
     print(f"  Mean memory consumed: {stats.mean_memory_consumed_percent:.1f}%")
     print()
diff --git a/eudoxia/executor/executor.py b/eudoxia/executor/executor.py
@@ -64,7 +64,7 @@ def get_consumed_ram_gb(self) -> float:
         """Return total consumed RAM across all pools in GB."""
         return sum(p.get_consumed_ram_gb() for p in self.pools)
 
-    def run_one_tick(self, suspensions: List[Suspend],
+    def run_one_tick(self, current_tick: int, suspensions: List[Suspend],
                      assignments: List[Assignment]) -> List[ExecutionResult]:
         '''
         Largely passing through relevant assignments to the pool they belong to.
@@ -73,7 +73,7 @@ def run_one_tick(self, suspensions: List[Suspend],
         for id_ in range(self.num_pools):
             pool_suspensions = [s for s in suspensions if s.pool_id == id_]
             pool_assignments = [a for a in assignments if a.pool_id == id_]
-            pool_results = self.pools[id_].run_one_tick(pool_suspensions, pool_assignments)
+            pool_results = self.pools[id_].run_one_tick(current_tick, pool_suspensions, pool_assignments)
             results.extend(pool_results)
 
         return results
diff --git a/eudoxia/executor/resource_pool.py b/eudoxia/executor/resource_pool.py
@@ -141,7 +141,17 @@ def _run_out_of_memory_killer(self):
                 break
             victim.kill("OOM")
 
-    def run_one_tick(self, suspensions: List[Suspend],
+    def _run_out_of_time_killer(self, current_tick: int):
+        """Kill each unfinished container running an op whose pipeline has timed out."""
+        for container in self.active_containers:
+            if container.is_completed():
+                continue  # finished containers are left alone
+            expired = any(op.pipeline.runtime_status().has_timed_out(current_tick)
+                          for op in container.operators)
+            if expired:  # one timed-out pipeline is enough to kill the whole container
+                container.kill("timeout")
+
+    def run_one_tick(self, current_tick: int, suspensions: List[Suspend],
                      assignments: List[Assignment]) -> List[ExecutionResult]:
         """
         Run a single tick for the executor, decrement remaining ticks for all
@@ -194,6 +204,9 @@ def run_one_tick(self, suspensions: List[Suspend],
         # Kill as necessary to keep within total and individual limits
         self._run_out_of_memory_killer()
 
+        # Kill containers with ops belonging to timed-out pipelines
+        self._run_out_of_time_killer(current_tick)
+
         # Process completed containers (including those killed by pool-level OOM)
         to_remove = []
         for c in self.active_containers:
diff --git a/eudoxia/simulator.py b/eudoxia/simulator.py
@@ -230,6 +230,10 @@ def get_param_defaults() -> Dict:
         # random seed for workload generation
         "random_seed": 42,
 
+        ### Pipeline Params ###
+        # maximum job time in seconds (0 = no limit)
+        "max_job_seconds": 0,
+
         ### Estimator Params ###
         # estimator algorithm: "" (default, no estimator) or "noisy"
         "estimator_algo": "",
@@ -315,8 +319,9 @@ def run_simulator(param_input: Union[str, Dict], workload: Workload = None) -> S
         handler.setFormatter(sim_formatter)
 
     tick_number = 0
-    max_ticks = int(params["duration"] * params["ticks_per_second"])
-    logger.info(f"Running for {params['duration']}s or {max_ticks} ticks")
+    max_simulation_ticks = int(params["duration"] * params["ticks_per_second"])
+    max_job_ticks = int(params["max_job_seconds"] * ticks_per_second)
+    logger.info(f"Running for {params['duration']}s or {max_simulation_ticks} ticks")
     logger.info(f"Running with random seed {params['random_seed']}")
 
     # a pipeline may have many operators.  These can get grouped
@@ -329,6 +334,13 @@ def run_simulator(param_input: Union[str, Dict], workload: Workload = None) -> S
     num_failures = 0
     failure_error_counts = defaultdict(int)
     executor_results = []
+    # outstanding_pipelines tracks pipelines we still expect to complete.
+    # Pipelines are removed when they succeed or time out.  Timed-out
+    # pipelines simply show up as not completed — we don't record their
+    # latency.  Note: the scheduler manages its own queues independently,
+    # so it may still assign ops for a pipeline that has already been
+    # removed from here.  That's fine — the executor's _run_out_of_time_killer
+    # will immediately kill any such containers.
     outstanding_pipelines: Dict[str, Pipeline] = {}
     pipeline_arrivals_by_priority: Dict[Priority, int] = {
         Priority.QUERY: 0,
@@ -344,22 +356,22 @@ def run_simulator(param_input: Union[str, Dict], workload: Workload = None) -> S
     memory_consumed_percent_samples: List[float] = []
 
     # IMPORTANT!  This is the main simulation loop.
-    for tick_number in range(max_ticks):
+    for tick_number in range(max_simulation_ticks):
         sim_formatter.set_simulated_elapsed_seconds(tick_number / ticks_per_second)
 
         # track new work
         new_pipelines: List[Pipeline] = workload.run_one_tick()
         for p in new_pipelines:
             logger.info(f"Pipeline arrived with Priority {p.priority} and {len(p.values)} op(s)")
-            p.runtime_status().record_arrival(tick_number)
+            p.runtime_status().record_arrival(tick_number, max_job_ticks)
             outstanding_pipelines[p.pipeline_id] = p
             pipeline_arrivals_by_priority[p.priority] += 1
             for op in p.values:
                 estimator.estimate(op)
 
         # simulate scheduler/executor
         suspensions, assignments = scheduler.run_one_tick(executor_results, new_pipelines)
-        executor_results = executor.run_one_tick(suspensions, assignments)
+        executor_results = executor.run_one_tick(tick_number, suspensions, assignments)
 
         # track stats
         num_pipelines_created += len(new_pipelines)
@@ -380,6 +392,14 @@ def run_simulator(param_input: Union[str, Dict], workload: Workload = None) -> S
                     pipeline_latencies_by_priority[pipeline.priority].append(latency_ticks)
                     del outstanding_pipelines[pipeline_id]
 
+        # Optimization: periodically remove timed-out pipelines so we
+        # don't keep scanning them for completion every tick.
+        if max_job_ticks > 0 and tick_number % ticks_per_second == 0:
+            for pipeline_id in list(outstanding_pipelines.keys()):
+                pipeline = outstanding_pipelines[pipeline_id]
+                if pipeline.runtime_status().has_timed_out(tick_number):
+                    del outstanding_pipelines[pipeline_id]
+
         # log memory stats every 1 second of simulated time
         if tick_number % ticks_per_second == 0:
             total_ram = executor.get_total_ram_gb()
diff --git a/eudoxia/workload/runtime_status.py b/eudoxia/workload/runtime_status.py
@@ -56,15 +56,24 @@ def __init__(self, pipeline: 'Pipeline'):
         self.state_counts: Dict[OperatorState, int] = {state: 0 for state in OperatorState}
         self.arrival_tick: Optional[int] = None
         self.finish_tick: Optional[int] = None
+        self.max_job_ticks: int = 0
 
         for operator in pipeline.values:
             self.operator_states[operator] = OperatorState.PENDING
             self.state_counts[OperatorState.PENDING] += 1
 
-    def record_arrival(self, tick: int):
+    def record_arrival(self, tick: int, max_job_ticks: int = 0):
         """Record the tick at which this pipeline arrived."""
         assert self.arrival_tick is None, "arrival_tick already recorded"
         self.arrival_tick = tick
+        self.max_job_ticks = max_job_ticks  # a value <= 0 makes has_timed_out() always return False
+
+    def has_timed_out(self, current_tick: int) -> bool:
+        """Return True once at least max_job_ticks ticks have passed since arrival."""
+        if self.max_job_ticks > 0:  # a non-positive limit means the pipeline never times out
+            assert self.arrival_tick is not None, "arrival_tick not recorded"
+            return current_tick - self.arrival_tick >= self.max_job_ticks
+        return False
 
     def check_transition(self, operator: 'Operator', new_state: OperatorState) -> tuple[bool, Optional[str]]:
         """
diff --git a/tests/test_container.py b/tests/test_container.py
@@ -39,13 +39,13 @@ def test_container_oom():
     )
 
     # Start container via pool
-    pool.run_one_tick([], [assignment])
+    pool.run_one_tick(0, [], [assignment])
     container = pool.active_containers[0]
 
     # Run until completion
     ticks_executed = 1  # First tick already done
     while not container.is_completed():
-        pool.run_one_tick([], [])
+        pool.run_one_tick(0, [], [])
         ticks_executed += 1
         assert ticks_executed <= 1000, "Container should complete within 1000 ticks"
 
@@ -77,10 +77,10 @@ def test_container_oom_transitions_remaining_ops_to_failed():
     )
 
     # Start container and run until OOM
-    pool.run_one_tick([], [assignment])
+    pool.run_one_tick(0, [], [assignment])
     container = pool.active_containers[0]
     while not container.is_completed():
-        pool.run_one_tick([], [])
+        pool.run_one_tick(0, [], [])
 
     assert container.error == "OOM"
 
@@ -112,7 +112,7 @@ def test_container_immediate_oom():
     )
 
     # First tick creates container and runs OOM killer
-    results = pool.run_one_tick([], [assignment])
+    results = pool.run_one_tick(0, [], [assignment])
 
     # Container should be killed immediately
     assert len(results) == 1, "Should have one result"
diff --git a/tests/test_executor.py b/tests/test_executor.py
@@ -69,7 +69,7 @@ def test_resource_pool_basic():
     results_by_pipeline = {}
 
     for tick in range(max_ticks):
-        results = pool.run_one_tick(suspensions, assignments)
+        results = pool.run_one_tick(0, suspensions, assignments)
 
         # Organize results by pipeline_id
         for result in results:
@@ -116,7 +116,7 @@ def test_resource_pool_dependencies():
     )
 
     with pytest.raises(AssertionError, match="Dependencies not satisfied"):
-        pool.run_one_tick([], [assignment_b])
+        pool.run_one_tick(0, [], [assignment_b])
 
 
 def test_runtime_status_dependencies():
@@ -214,7 +214,7 @@ def test_memory_allocated_vs_consumed():
     completed = False
     for tick in range(1000):
         assignments = [assignment] if tick == 0 else []
-        results = executor.run_one_tick([], assignments)
+        results = executor.run_one_tick(0, [], assignments)
 
         if results:
             # Container just completed: both should be 0
@@ -287,7 +287,7 @@ def test_memory_overcommit_kills_highest_scorer():
     )
 
     # Start both containers
-    pool.run_one_tick([], [assignment_a, assignment_b])
+    pool.run_one_tick(0, [], [assignment_a, assignment_b])
 
     # Tick until consumption exceeds capacity
     # Memory grows at DISK_SCAN_GB_SEC = 20 GB/sec
@@ -297,7 +297,7 @@ def test_memory_overcommit_kills_highest_scorer():
 
     killed_pipeline = None
     for tick in range(100):
-        results = pool.run_one_tick([], [])
+        results = pool.run_one_tick(0, [], [])
         for r in results:
             if r.failed():
                 # Get pipeline ID from the failed container's ops
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
@@ -157,7 +157,7 @@ def test_op_not_double_queued_across_ticks():
     assert remaining_in_queue > 0, "some children should remain in queue"
 
     # Actually consume resources by passing assignments to executor
-    executor.run_one_tick([], assignments)
+    executor.run_one_tick(0, [], assignments)
 
     # op2 completes
     op2.transition(OperatorState.COMPLETED)
@@ -196,7 +196,7 @@ def test_failed_op_gets_retry_stats():
     first_ram = assignments[0].ram
 
     # Run executor - op will OOM because it needs 10GB but only got 5GB
-    results = executor.run_one_tick([], assignments)
+    results = executor.run_one_tick(0, [], assignments)
     assert len(results) == 1
     assert results[0].error == "OOM"
 
@@ -236,9 +236,9 @@ def test_partial_failure_unblocks_dependent_op():
     assert set(assignments[0].ops) == {op1, op2, op3}
 
     # Run executor until we get a result (op1 should complete, then op2 OOMs)
-    results = executor.run_one_tick([], assignments)
+    results = executor.run_one_tick(0, [], assignments)
     while not results:
-        results = executor.run_one_tick([], [])
+        results = executor.run_one_tick(0, [], [])
     assert len(results) == 1
     assert results[0].error == "OOM"
     assert op1.state() == OperatorState.COMPLETED, "op1 should have completed before OOM"