This repository was archived by the owner on Feb 13, 2026. It is now read-only.

Commit a239b1b

Merge pull request #310 from punch-mission/pipeline-adjustments
Pipeline adjustments
2 parents 8c2e1e5 + 2377aa8 commit a239b1b

8 files changed

Lines changed: 30 additions & 24 deletions

punchpipe/control/db.py

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ def directory(self, root: str):
 Index("construct_background", File.level, File.observatory, File.outlier, File.date_obs, File.state, File.file_type)
 Index("get_cal_file", File.file_type, File.observatory, File.date_obs, File.state)
 Index("CNN", File.file_type, File.observatory, File.level, File.state, File.outlier)
+Index("processing_flow_index", File.processing_flow)


 class Flow(Base):
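
The new single-column index serves the per-flow output-file lookup in processor.py below, `session.query(File).where(File.processing_flow == flow_db_entry.flow_id)`, which otherwise has no index on the file table to use. Here is a minimal, self-contained sketch of the pattern; the simplified File model is hypothetical, not punchpipe's actual schema:

```python
# Sketch only: a stand-in File model with the same kind of single-column index.
from sqlalchemy import Index, Integer, String, create_engine, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class File(Base):
    __tablename__ = "files"
    file_id: Mapped[int] = mapped_column(Integer, primary_key=True)
    state: Mapped[str] = mapped_column(String(16))
    processing_flow: Mapped[int] = mapped_column(Integer)


# Declared at module level, like the indexes in db.py above.
Index("processing_flow_index", File.processing_flow)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    # The lookup generic_process_flow_logic performs for each flow; with the
    # index this is a B-tree probe instead of a full scan of the file table.
    files = session.execute(select(File).where(File.processing_flow == 42)).scalars().all()
```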

punchpipe/control/launcher.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ def gather_planned_flows(session, weight_to_launch, max_flows_to_launch, flow_we
     flows = (session.query(Flow)
              .where(Flow.state == "planned")
              .where(Flow.flow_type.in_(enabled_flows))
-             .order_by(Flow.is_backprocessing.asc(), Flow.priority.desc(), Flow.creation_time.desc())
+             .order_by(Flow.is_backprocessing.asc(), Flow.priority.desc(), Flow.creation_time.asc())
              .limit(max_to_select).all())
     selected_flows = []
     selected_weight = 0
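
The only change here is `creation_time.desc()` → `creation_time.asc()`: among planned flows with the same backprocessing flag and priority, the launcher now takes the oldest first (FIFO ordering) rather than the newest, so the oldest work is drained first. A toy illustration with hypothetical flow tuples:

```python
# Hypothetical flows as (is_backprocessing, priority, creation_time, label).
from datetime import datetime

flows = [
    (False, 5, datetime(2026, 2, 1, 12, 0), "newer"),
    (False, 5, datetime(2026, 2, 1, 10, 0), "older"),
    (True,  9, datetime(2026, 2, 1, 9, 0), "backprocessing"),
]

# Same ordering as the query: is_backprocessing ASC, priority DESC, creation_time ASC.
flows.sort(key=lambda f: (f[0], -f[1], f[2]))
print([f[3] for f in flows])  # ['older', 'newer', 'backprocessing']
```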

punchpipe/control/processor.py

Lines changed: 17 additions & 11 deletions
@@ -57,17 +57,23 @@ def generic_process_flow_logic(flow_id: int | list[int], core_flow_to_launch, pi
         file_db_entry_list = session.query(File).where(File.processing_flow == flow_db_entry.flow_id).all()

         # update the file database entries as being created
-        if file_db_entry_list:
-            for file_db_entry in file_db_entry_list:
-                if file_db_entry.state != "planned":
-                    raise RuntimeError(f"File id {file_db_entry.file_id} has already been created.")
-                if os.path.exists(os.path.join(
-                        file_db_entry.directory(pipeline_config['root']), file_db_entry.filename())):
-                    raise RuntimeError(f"Expected output file {file_db_entry.filename()} (id {file_db_entry.file_id}) "
-                                       "already exists on disk")
-                file_db_entry.state = "creating"
-        else:
-            raise RuntimeError("There should be at least one file associated with this flow. Found 0.")
+        try:
+            if file_db_entry_list:
+                for file_db_entry in file_db_entry_list:
+                    if file_db_entry.state != "planned":
+                        raise RuntimeError(f"File id {file_db_entry.file_id} has already been created.")
+                    if os.path.exists(os.path.join(
+                            file_db_entry.directory(pipeline_config['root']), file_db_entry.filename())):
+                        raise RuntimeError(f"Expected output file {file_db_entry.filename()} (id {file_db_entry.file_id}) "
+                                           "already exists on disk")
+                    file_db_entry.state = "creating"
+            else:
+                raise RuntimeError("There should be at least one file associated with this flow. Found 0.")
+        except:
+            # The exception handler rolls back the transaction, but we do want our start_time to stay in place. So
+            # commit on error, but otherwise let the transaction keep growing into a big batch
+            session.commit()
+            raise
         file_db_entry_lists.append(file_db_entry_list)
     session.commit()
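
The inline comment carries the reasoning: an exception handler upstream rolls the transaction back, which would also discard the flow's already-recorded start_time, so this code commits before re-raising; on success it deliberately leaves the transaction open so updates from many flows batch into the single session.commit() after the loop. A stripped-down sketch of that commit-on-error pattern, with hypothetical flow and file-entry objects:

```python
# Sketch only: 'flow' and 'entries' stand in for the real ORM objects.
from datetime import datetime

from sqlalchemy.orm import Session


def mark_files_creating(session: Session, flow, entries: list) -> None:
    flow.start_time = datetime.now()  # an earlier write we want to survive failure
    try:
        if not entries:
            raise RuntimeError("There should be at least one file associated with this flow. Found 0.")
        for entry in entries:
            if entry.state != "planned":
                raise RuntimeError(f"File id {entry.file_id} has already been created.")
            entry.state = "creating"
    except Exception:
        # Persist everything written so far before the caller's rollback fires,
        # then re-raise so the failure still propagates.
        session.commit()
        raise
    # On success, leave the transaction open so later flows batch into one commit.
```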

punchpipe/control/scheduler.py

Lines changed: 7 additions & 0 deletions
@@ -111,6 +111,11 @@ def generic_scheduler_flow_logic(
         database_flow_info = construct_child_flow_info(parent_files, children_files,
                                                        pipeline_config, session=session,
                                                        reference_time=reference_time, **args_dictionary)
+        # We've had some failures where a flow reports "no associated files", despite the output files having
+        # their processing_flow set properly. Best guess is the DB is running slow, and so the new flow has been
+        # committed but the files' processing_flow hasn't been updated yet. So let's not let the state be
+        # 'planned' until everything is in place.
+        database_flow_info.state = 'being_planned'
         if backprocess_cutoff := pipeline_config.get('prioritize_most_recent_n_days', None):
             cutoff = datetime.now(UTC) - timedelta(days=backprocess_cutoff)
             if all(cf.date_obs.replace(tzinfo=UTC) < cutoff for cf in children_files):
@@ -138,5 +143,7 @@ def generic_scheduler_flow_logic(
         for parent_file, child_file in iterable:
             session.add(FileRelationship(parent=parent_file.file_id, child=child_file.file_id))

+    database_flow_info.state = 'planned'
+
     session.commit()
     return len(ready_files)
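
The fix is a two-phase publish: a new flow is created as 'being_planned', which the launcher's `Flow.state == "planned"` filter ignores, and is only flipped to 'planned' (second hunk) once every output file's processing_flow and all parent/child relationships are in place. A sketch of the idea with hypothetical objects:

```python
# Sketch only: 'flow' and 'output_files' stand in for the real ORM objects.
def plan_flow(session, flow, output_files):
    flow.state = "being_planned"   # not yet visible to gather_planned_flows
    session.add(flow)
    session.flush()                # assigns flow.flow_id without publishing it

    for f in output_files:
        f.processing_flow = flow.flow_id  # link outputs back to their flow
        session.add(f)

    flow.state = "planned"         # publish only after all links exist
    session.commit()               # the flow and its links become visible together
```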

punchpipe/flows/fcorona.py

Lines changed: 1 addition & 1 deletion
@@ -118,7 +118,7 @@ def construct_f_corona_background_scheduler_flow(pipeline_config_path=None, sess
         logger.info("Flow 'construct_f_corona_background' is not enabled---halting scheduler")
         return 0

-    max_flows = 2 * pipeline_config['flows']['construct_f_corona_background'].get('concurrency_limit', 1000)
+    max_flows = pipeline_config['flows']['construct_f_corona_background'].get('concurrency_limit', 1000)
     existing_flows = (session.query(Flow)
                       .where(Flow.flow_type == 'construct_f_corona_background')
                       .where(Flow.state.in_(["planned", "launched", "running"])).count())
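
Dropping the `2 *` headroom means at most `concurrency_limit` of these flows can sit in the planned/launched/running states at once, rather than twice that. Assuming the two values are compared to decide how many new flows to plan (the rest of the function isn't in this hunk), the cap behaves roughly like this:

```python
# Toy numbers; 'concurrency_limit' is read from the pipeline config as above.
pipeline_config = {"flows": {"construct_f_corona_background": {"concurrency_limit": 4}}}

max_flows = pipeline_config["flows"]["construct_f_corona_background"].get("concurrency_limit", 1000)
existing_flows = 3  # planned/launched/running flows, as counted by the query
headroom = max(0, max_flows - existing_flows)
print(headroom)  # 1 -> at most one more background flow gets planned this cycle
```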

punchpipe/flows/level3.py

Lines changed: 0 additions & 8 deletions
@@ -53,7 +53,6 @@ def level3_PTM_query_ready_files(session, pipeline_config: dict, reference_time=
     return [[f.file_id] for f in actually_ready_files]


-@task(cache_policy=NO_CACHE)
 def level3_PTM_construct_flow_info(level2_files: list[File], level3_file: File,
                                    pipeline_config: dict, session=None, reference_time=None):
     session = get_database_session()  # TODO: replace so this works in the tests by passing in a test
@@ -83,7 +82,6 @@ def level3_PTM_construct_flow_info(level2_files: list[File], level3_file: File,
     )


-@task(cache_policy=NO_CACHE)
 def level3_PTM_construct_file_info(input_files: t.List[File], pipeline_config: dict, reference_time=None) -> t.List[File]:
     date_obses = [f.date_obs for f in input_files]

@@ -151,7 +149,6 @@ def level3_PIM_query_ready_files(session, pipeline_config: dict, reference_time=
     return [[f.file_id] for f in actually_ready_files]


-@task(cache_policy=NO_CACHE)
 def level3_PIM_construct_flow_info(level2_files: list[File], level3_file: File, pipeline_config: dict,
                                    session=None, reference_time=None):
     session = get_database_session()  # TODO: replace so this works in the tests by passing in a test
@@ -188,7 +185,6 @@ def level3_PIM_construct_flow_info(level2_files: list[File], level3_file: File,
     )


-@task(cache_policy=NO_CACHE)
 def level3_PIM_construct_file_info(level2_files: t.List[File], pipeline_config: dict, reference_time=None) -> t.List[File]:
     date_obses = [f.date_obs for f in level2_files]

@@ -260,7 +256,6 @@ def level3_CIM_query_ready_files(session, pipeline_config: dict, reference_time=
     return [[f.file_id] for f in actually_ready_files]


-@task(cache_policy=NO_CACHE)
 def level3_CIM_construct_flow_info(level2_files: list[File], level3_file: File, pipeline_config: dict,
                                    session=None, reference_time=None):
     session = get_database_session()  # TODO: replace so this works in the tests by passing in a test
@@ -298,7 +293,6 @@ def level3_CIM_construct_flow_info(level2_files: list[File], level3_file: File,
     )


-@task(cache_policy=NO_CACHE)
 def level3_CIM_construct_file_info(level2_files: t.List[File], pipeline_config: dict, reference_time=None) -> t.List[File]:
     date_obses = [f.date_obs for f in level2_files]

@@ -366,7 +360,6 @@ def level3_CTM_query_ready_files(session, pipeline_config: dict, reference_time=
     return [[f.file_id] for f in actually_ready_files]


-@task(cache_policy=NO_CACHE)
 def level3_CTM_construct_flow_info(level2_files: list[File], level3_file: File,
                                    pipeline_config: dict, session=None, reference_time=None):
     session = get_database_session()  # TODO: replace so this works in the tests by passing in a test
@@ -397,7 +390,6 @@ def level3_CTM_construct_flow_info(level2_files: list[File], level3_file: File,
     )


-@task(cache_policy=NO_CACHE)
 def level3_CTM_construct_file_info(input_files: t.List[File], pipeline_config: dict, reference_time=None, ) -> t.List[File]:
     date_obses = [f.date_obs for f in input_files]

punchpipe/flows/stray_light.py

Lines changed: 2 additions & 2 deletions
@@ -190,7 +190,7 @@ def construct_polarized_stray_light_check_for_inputs(session,
         for group in second_half_inputs[:max_files_per_half]:
             all_ready_files.extend(group)

-        logger.info(f"{len(all_ready_files)} Level 1 P*{reference_files[0].observatory} files will be used "
+        logger.info(f"{len(all_ready_files)} Level 1 Y*{reference_files[0].observatory} files will be used "
                     "for stray light estimation.")
         return [f.file_id for f in all_ready_files]
     return []
@@ -293,7 +293,7 @@ def construct_stray_light_scheduler_flow(pipeline_config_path=None, session=None
         logger.info("Flow 'construct_stray_light' is not enabled---halting scheduler")
         return

-    max_flows = 2 * pipeline_config['flows']['construct_stray_light'].get('concurrency_limit', 1000)
+    max_flows = pipeline_config['flows']['construct_stray_light'].get('concurrency_limit', 1000)
     existing_flows = (session.query(Flow)
                       .where(Flow.flow_type == 'construct_stray_light')
                       .where(Flow.state.in_(["planned", "launched", "running"])).count())

punchpipe/speedster.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def gather_planned_flows(session, enabled_flows, max_n=None):
     flows = (session.query(Flow)
              .where(Flow.state == "planned")
              .where(Flow.flow_type.in_(enabled_flows))
-             .order_by(Flow.is_backprocessing.asc(), Flow.priority.desc(), Flow.creation_time.desc())
+             .order_by(Flow.is_backprocessing.asc(), Flow.priority.desc(), Flow.creation_time.asc())
              .limit(max_n).all())
     count_per_type = defaultdict(lambda: 0)
     flow_ids = []
