Skip to content

Commit 4aed9d8

Browse files
authored
Merge pull request #867 from punch-mission/updates
Updates
2 parents ea93185 + 6f70aa7 commit 4aed9d8

9 files changed

Lines changed: 56 additions & 10 deletions

File tree

changelog/867.feature.2.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The whole pipeline's memory use is now capped.

changelog/867.feature.3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The config file can now contain per-server values.

changelog/867.feature.4.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Starfield generation uses 32-bit floats to reduce memory pressure.

changelog/867.feature.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The maximum number of running flows can be capped.

punchbowl/auto/cli.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,14 @@ def run(configuration_path, launch_prefect=False, launch_dask_cluster=False):
157157
try:
158158
numa_prefix_control = ["numactl", "--localalloc", "--physcpubind=0-11"]
159159
numa_prefix_workers = ["numactl", "--localalloc", "--physcpubind=12-63,64-125,192-255"]
160+
# This starts our pipeline in a cgroup context in which the collective memory use of the pipeline and
161+
# all subprocesses, including shared memory, is capped at a certain amount. If we approach an
162+
# out-of-memory condition, this hopefully contains it to the pipeline while keeping the rest of the
163+
# server (including, critically, SSH access) healthy.
164+
mem_limit_prefix = ["systemd-run", "--scope", "-p", "MemoryMax=1800G", "-p", "MemoryHigh=1700G", "--user",
165+
"--description", "Limit pipeline memory"]
160166
if launch_prefect:
161-
print("Launcing prefect")
167+
print("Launching prefect")
162168
prefect_process = subprocess.Popen(
163169
[*numa_prefix_control, "prefect", "server", "start", "--no-services"], stdout=f, stderr=f)
164170
time.sleep(5)
@@ -181,7 +187,8 @@ def run(configuration_path, launch_prefect=False, launch_dask_cluster=False):
181187
# These processes send a _lot_ of output, so we let it go to the screen instead of making the log file
182188
# enormous
183189
def data_process_launcher() -> subprocess.Popen:
184-
return subprocess.Popen([*numa_prefix_workers, "punchpipe", "serve-data", configuration_path])
190+
return subprocess.Popen([*mem_limit_prefix, *numa_prefix_workers, "punchpipe", "serve-data",
191+
configuration_path])
185192

186193
def control_process_launcher() -> subprocess.Popen:
187194
return subprocess.Popen([*numa_prefix_control, "punchpipe", "serve-control", configuration_path])

punchbowl/auto/control/launcher.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def escalate_long_waiting_flows(session, pipeline_config):
121121

122122

123123
def determine_launchable_flow_count(weight_planned, weight_running, max_weight_running, max_weight_to_launch,
124-
max_flows_to_launch):
124+
max_flows_to_launch, max_flows_running, num_running_flows):
125125
logger = get_run_logger()
126126
amount_to_launch = max_weight_running - weight_running
127127
logger.info(f"Total weight {amount_to_launch:.2f} can be launched at this time.")
@@ -130,7 +130,12 @@ def determine_launchable_flow_count(weight_planned, weight_running, max_weight_r
130130
amount_to_launch = max(0, amount_to_launch)
131131
logger.info(f"Will launch up to {amount_to_launch:.2f} weight and {max_flows_to_launch} flows")
132132

133-
return min(amount_to_launch, weight_planned), max_flows_to_launch
133+
number_launchable = max_flows_running - num_running_flows
134+
n_to_launch = min(number_launchable, max_flows_to_launch)
135+
logger.info(f"{num_running_flows} flows running now. Max {max_flows_running} total, {max_flows_to_launch} per "
136+
f"launch window. Launching up to {n_to_launch}.")
137+
138+
return min(amount_to_launch, weight_planned), n_to_launch
134139

135140

136141
@task(cache_policy=NO_CACHE)
@@ -295,9 +300,11 @@ async def launcher(pipeline_config_path=None):
295300
max_weight_running = pipeline_config["control"]["launcher"]["max_weight_running"]
296301
max_weight_to_launch = pipeline_config["control"]["launcher"]["max_weight_to_launch_at_once"]
297302
max_flows_to_launch = pipeline_config["control"]["launcher"]["max_flows_to_launch_at_once"]
303+
max_flows_running = pipeline_config["control"]["launcher"]["max_flows_running"]
298304

299305
weight_to_launch, max_flows_to_launch = determine_launchable_flow_count(
300-
weight_planned, weight_running, max_weight_running, max_weight_to_launch, max_flows_to_launch)
306+
weight_planned, weight_running, max_weight_running, max_weight_to_launch, max_flows_to_launch,
307+
max_flows_running, num_running_flows)
301308

302309
flows_to_launch, tags_by_flow, selected_weight, number_of_flows, counts_per_type = gather_planned_flows(
303310
session, weight_to_launch, max_flows_to_launch, flow_weights, flow_enabled, flow_batch_sizes, flow_hosts)

punchbowl/auto/control/tests/test_launcher.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def test_filter_for_launchable_flows(db, flow_weights, flow_hosts):
158158
running_count, planned_count, weight_planned, weight_running = count_flows.fn(db, flow_weights, flow_hosts)
159159
max_weight_running = 30
160160
ready_to_launch_weight, max_flows_to_launch = determine_launchable_flow_count(
161-
weight_planned, weight_running, max_weight_running, math.inf, 10)
161+
weight_planned, weight_running, max_weight_running, math.inf, 10, 100, running_count)
162162
assert ready_to_launch_weight == 5
163163
assert max_flows_to_launch == 10
164164

@@ -168,7 +168,7 @@ def test_filter_for_launchable_flows_with_max_of_1(db, flow_weights, flow_batch_
168168
running_count, planned_count, weight_planned, weight_running = count_flows.fn(db, flow_weights, flow_hosts)
169169
max_weight_running = 1
170170
ready_to_launch_weight, max_flows_to_launch = determine_launchable_flow_count(
171-
weight_planned, weight_running, max_weight_running, math.inf, 10)
171+
weight_planned, weight_running, max_weight_running, math.inf, 10, 100, running_count)
172172
assert ready_to_launch_weight == 1
173173
assert max_flows_to_launch == 10
174174
flows, tags_by_flow, selected_weight, number_of_flows, count_per_type = gather_planned_flows.fn(
@@ -184,16 +184,26 @@ def test_filter_for_launchable_flows_with_max_of_0(db, flow_weights, flow_hosts)
184184
running_count, planned_count, weight_planned, weight_running = count_flows.fn(db, flow_weights, flow_hosts)
185185
max_weight_running = 0
186186
ready_to_launch_weight, max_flows_to_launch = determine_launchable_flow_count(
187-
weight_planned, weight_running, max_weight_running, math.inf, 0)
187+
weight_planned, weight_running, max_weight_running, math.inf, 0, 100, running_count)
188188
assert ready_to_launch_weight == 0
189189
assert max_flows_to_launch == 0
190190

191191

192+
def test_filter_for_launchable_flows_cap_by_number(db, flow_weights, flow_hosts):
193+
with prefect_test_harness(), disable_run_logger():
194+
running_count, planned_count, weight_planned, weight_running = count_flows.fn(db, flow_weights, flow_hosts)
195+
max_weight_running = 100
196+
ready_to_launch_weight, max_flows_to_launch = determine_launchable_flow_count(
197+
weight_planned, weight_running, max_weight_running, math.inf, 100, 1, running_count)
198+
assert ready_to_launch_weight > 0
199+
assert max_flows_to_launch == 1
200+
201+
192202
def test_filter_for_launchable_flows_with_empty_db(db_empty, flow_weights, flow_hosts):
193203
with prefect_test_harness(), disable_run_logger():
194204
running_count, planned_count, weight_planned, weight_running = count_flows.fn(db_empty, flow_weights, flow_hosts)
195205
max_weight_running = 30
196206
ready_to_launch_weight, max_flows_to_launch = determine_launchable_flow_count(
197-
weight_planned, weight_running, max_weight_running, math.inf, 20)
207+
weight_planned, weight_running, max_weight_running, math.inf, 20, 100, running_count)
198208
assert ready_to_launch_weight == 0
199209
assert max_flows_to_launch == 20

punchbowl/auto/control/util.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import re
3+
import socket
34
from math import inf
45
from datetime import UTC, datetime
56
from itertools import islice
@@ -65,10 +66,22 @@ def load_pipeline_configuration(path: str = None) -> dict:
6566
path = run_coro_as_sync(path)
6667
with open(path) as f:
6768
config = yaml.load(f, Loader=FullLoader)
69+
hostify_config(config)
6870
# TODO: add validation
6971
return config
7072

7173

74+
def hostify_config(config):
    """Resolve per-server configuration overrides in place.

    Any key suffixed with ``-<short hostname>`` (e.g. ``max_flows-myserver``
    on host ``myserver``) replaces the corresponding un-suffixed key for this
    machine. Nested dicts are processed recursively. Keys tagged for other
    hosts are left untouched (they remain as literal keys).

    Parameters
    ----------
    config : dict
        Parsed configuration mapping; mutated in place.
    """
    # Use the short hostname so "myserver.example.com" matches keys tagged "-myserver".
    tag = '-' + socket.gethostname().split(".")[0]
    # Snapshot the keys: we add and delete entries while walking the dict.
    for key in list(config.keys()):
        if isinstance(config[key], dict):
            hostify_config(config[key])
        if isinstance(key, str) and key.endswith(tag):
            # Strip only the trailing tag. str.replace(tag, '') would also
            # delete any interior occurrence of the tag within the key name.
            new_key = key.removesuffix(tag)
            config[new_key] = config[key]
            del config[key]
83+
84+
7285
def load_quicklook_scaling(level: str = None, product: str = None, obscode: str = None, path: str = None) -> (float, float):
7386
if path is None:
7487
path = Variable.get("punchpipe_config", "punchpipe_config.yaml")

punchbowl/level3/stellar.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ def __init__(self, layer: int | None = None, apply_mask: bool = True, key: str =
134134

135135
def load_image(self, filename: str) -> ImageHolder:
136136
"""Load an image."""
137-
cube = load_ndcube_from_fits(filename, key=self.key, include_provenance=False, include_uncertainty=False)
137+
cube = load_ndcube_from_fits(filename, key=self.key, include_provenance=False, include_uncertainty=False,
138+
dtype=np.float32)
138139

139140
if self.apply_mask:
140141
mask = (cube.data[self.layer] == 0) if self.layer is not None else (cube.data == 0)
@@ -258,6 +259,7 @@ def generate_starfield_background(
258259
n_procs=n_procs,
259260
processor=PUNCHImageProcessor(0, apply_mask=True, key="A"),
260261
handle_wrap_point=False,
262+
dtype=np.float32,
261263
target_mem_usage=target_mem_usage)
262264
logger.info("Ending m starfield")
263265
out_data_m = starfield_m.starfield - percentile_filter(starfield_m.starfield, 5, 10)
@@ -273,6 +275,7 @@ def generate_starfield_background(
273275
n_procs=n_procs,
274276
processor=PUNCHImageProcessor(1, apply_mask=True, key="A"),
275277
handle_wrap_point=False,
278+
dtype=np.float32,
276279
target_mem_usage=target_mem_usage)
277280
logger.info("Ending z starfield")
278281
out_data_z = starfield_z.starfield - percentile_filter(starfield_z.starfield, 5, 10)
@@ -288,6 +291,7 @@ def generate_starfield_background(
288291
n_procs=n_procs,
289292
processor=PUNCHImageProcessor(2, apply_mask=True, key="A"),
290293
handle_wrap_point=False,
294+
dtype=np.float32,
291295
target_mem_usage=target_mem_usage)
292296
logger.info("Ending p starfield")
293297
out_data_p = starfield_p.starfield - percentile_filter(starfield_p.starfield, 5, 10)
@@ -306,6 +310,7 @@ def generate_starfield_background(
306310
n_procs=n_procs,
307311
processor=PUNCHImageProcessor(None, apply_mask=True, key="A"),
308312
handle_wrap_point=False,
313+
dtype=np.float32,
309314
target_mem_usage=target_mem_usage)
310315
logger.info("Ending clear starfield")
311316
out_data = starfield_clear.starfield - percentile_filter(starfield_clear.starfield, 5, 10)

0 commit comments

Comments
 (0)