Commit bba2f7a

chore: refactor resolve_collections method

1 parent fa96dc5

2 files changed (+59, -42 lines)

docs/BUTLER.md

Lines changed: 32 additions & 0 deletions
```diff
@@ -50,3 +50,35 @@ The `PGPASSFILE` value is constructed using the `config.htcondor.remote_user_hom
 
 > [!NOTE]
 > Secrets for second-party Butlers may also be provided via an environment variable. By setting `LSST_DB_AUTH_CREDENTIALS` with the JSON string representation of a `db-auth.yaml` file, all dependencies on presumed filesystem objects in the submission environment are resolved.
+
+## Butler Collection Management
+
+CM Service creates tagged and chained Butler collections during its runtime:
+
+### Preflight
+During Campaign preflight, three Butler collection operations are called. This happens before any steps, groups, or jobs are created or executed.
+
+1. A *tagged* collection is made from the Campaign's `collection.campaign_source` setting, constrained by the Campaign's `data.data_query` setting.
+1. A *chained* collection is made from the Campaign's `collection.campaign_ancillary_inputs` setting.
+1. A *chained* collection is made from the previous two collections.
+
+The final chained collection is used as an *input* collection for all subsequent Campaign step operations, i.e., it is identified as part of the `payload.inCollection` for any BPS workflow files generated by the Campaign.
+
+### Stepwise Processing
+During Campaign stepwise processing, each Step in the Campaign includes Butler collection operations:
+
+1. A step-specific *chained* collection is made from the Campaign input collection and applied to the `payload.inCollection` parameter.
+1. A step-group-specific *run* collection is made as a side effect of executing the step, named as indicated by the Group's `payload.outputRun` BPS Workflow parameter.
+1. A step-specific *chained* collection is made from the set of *run* collections generated by the step-groups.
+
+> [!NOTE]
+> During Stepwise processing, the BPS Workflow `payload.dataQuery` is populated according to the Step's `child_config.base_query` parameter and modified according to any Group splitting algorithm applied to the Step; it is not affected by the `data.data_query` at the Campaign level.
+
+> [!NOTE]
+> Presumably, if the *tagged* Campaign input collection was constrained by a meaningful data query, then that query does not need to be repeated in the Stepwise consideration of Butler collections, and only the result of group-split algorithms is necessary. However, this means that any out-of-band observer of a CM-generated BPS workflow file will not understand the nature of the input collection and its interaction with the data query without cross-referencing, so insofar as it improves understandability, the workflow file should be as comprehensively detailed as possible, even if doing so is redundant.
+
+### Postflight
+During Campaign postflight, Butler collection operations are used to further chain together Campaign elements, eventually resulting in a single *chained* collection for the entire Campaign.
+
+1. Each step-specific *chained* collection is itself chained to a Campaign *chained* "output" collection.
+1. The final *chained* collection, named according to the Campaign's `collection.out` parameter, includes the Campaign "output" collection, the Campaign "input" collection, and the Campaign "resource_usage" collection.
```
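The preflight, stepwise, and postflight sequences above can be modeled end to end. The following is a minimal sketch using plain dicts in place of a real Butler registry; the helper names (`preflight`, `step`, `postflight`) and all collection names (`campaign/input`, `u/cm/out`, `HSC/raw/all`, …) are hypothetical illustrations, not CM Service or Butler APIs.

```python
# A chained collection is modeled as an ordered list of child collection
# names; the "registry" maps each chain name to its children.
Registry = dict[str, list[str]]


def preflight(registry: Registry, source: str, ancillary: list[str]) -> str:
    """Preflight: tag the source (data query applied), chain the ancillary
    inputs, then chain both into the Campaign input collection."""
    registry["campaign/tagged_source"] = [source]
    registry["campaign/ancillary"] = list(ancillary)
    registry["campaign/input"] = ["campaign/tagged_source", "campaign/ancillary"]
    return "campaign/input"  # used as payload.inCollection for every step


def step(registry: Registry, name: str, group_runs: list[str]) -> str:
    """Stepwise: chain a step input from the Campaign input collection, then
    chain the group-specific run collections into a step output chain."""
    registry[f"{name}/input"] = ["campaign/input"]
    registry[f"{name}/output"] = list(group_runs)
    return f"{name}/output"


def postflight(registry: Registry, step_outputs: list[str], out: str) -> str:
    """Postflight: chain the step outputs into the Campaign output chain, then
    build the final collection.out chain from the output, input, and
    resource_usage collections."""
    registry["campaign/output"] = list(step_outputs)
    registry[out] = ["campaign/output", "campaign/input", "campaign/resource_usage"]
    return out


registry: Registry = {}
preflight(registry, "HSC/raw/all", ["HSC/calib"])
s1 = step(registry, "step1", ["step1/group0/run1"])
postflight(registry, [s1], "u/cm/out")
```

After this sequence, resolving `u/cm/out` recursively reaches every run collection produced by the Campaign, which is the single-chain property the postflight stage is meant to guarantee.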

src/lsst/cmservice/db/node.py

Lines changed: 27 additions & 42 deletions
```diff
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import re
+from collections import ChainMap, defaultdict
 from typing import TYPE_CHECKING, Any
 
 from sqlalchemy.exc import IntegrityError
@@ -201,10 +203,9 @@ async def resolve_collections(
 
         Notes
         -----
-        This will return a dict with all of the collections
-        templated defined for this node resovled using
-        collection aliases and collection templates
-        defined up the processing heirarchy
+        This will return a dict with all of the collection templates defined
+        for this node resolved using collection aliases and collection
+        templates defined up the processing hierarchy
 
         Parameters
         ----------
@@ -219,47 +220,31 @@ async def resolve_collections(
         resolved_collections: dict
             Resolved collection names
         """
-        my_collections = await NodeMixin.get_collections(self, session)
+        raw_collections: dict[str, str | list[str]] = await NodeMixin.get_collections(self, session)
         collection_dict = await self.get_collections(session)
         name_dict = self._split_fullname(self.fullname)
-        name_dict["out"] = collection_dict.pop("out")
-        resolved_collections: dict = {}
-        for name_, val_ in my_collections.items():
-            if isinstance(val_, list):  # pragma: no cover
-                # FIXME, see if this is now being tested
-                resolved_collections[name_] = []
-                # FIXME disambiguate what types val_, item_ and f1 are supposed
-                # to be
-                for item_ in val_:
-                    try:
-                        f1 = item_.format(**collection_dict)
-                    except KeyError:
-                        f1 = val_
-                    try:
-                        resolved_collections[name_].append(f1.format(**name_dict))
-                    except KeyError as e:
-                        raise CMResolveCollectionsError(
-                            f"Failed to resolve collection {name_} {f1} using: {name_dict!s}",
-                        ) from e
-                resolved_collections[name_] = ",".join(resolved_collections[name_])
-            else:
-                try:
-                    f1 = val_.format(**collection_dict)
-                except KeyError:
-                    f1 = val_
-                try:
-                    resolved_collections[name_] = f1.format(**name_dict)
-                except KeyError as msg:
-                    raise CMResolveCollectionsError(
-                        f"Failed to resolve collection {name_}, {f1} using: {name_dict!s}",
-                    ) from msg
+        lookup_chain = ChainMap(collection_dict, name_dict, defaultdict(lambda: "MUST_OVERRIDE"))
+
+        resolved_collections = {
+            k: (v if isinstance(v, str) else ",".join(v)) for k, v in raw_collections.items()
+        }
+
+        # It may take multiple passes to format all the placeholder tokens in
+        # the collection strings; repeat the formatting until no such tokens
+        # remain.
+        while unresolved_collections := {
+            k: v for k, v in resolved_collections.items() if re.search("{.*}", v)
+        }:
+            for k, v in unresolved_collections.items():
+                resolved_collections[k] = v.format_map(lookup_chain)
+
         if throw_overrides:
-            for key, value in resolved_collections.items():
-                if "MUST_OVERRIDE" in value:  # pragma: no cover
-                    raise CMResolveCollectionsError(
-                        f"Attempts to resolve {key} collection includes MUST_OVERRIDE. Make sure to provide "
-                        "necessary collection names."
-                    )
+            if [v for v in resolved_collections.values() if re.search("MUST_OVERRIDE", v)]:
+                raise CMResolveCollectionsError(
+                    "Attempts to resolve collection includes MUST_OVERRIDE. Make sure to provide "
+                    "necessary collection names."
+                )
 
         return resolved_collections
 
     async def get_collections(
```
