-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmiddleware_interface.py
More file actions
2037 lines (1789 loc) · 89.6 KB
/
middleware_interface.py
File metadata and controls
2037 lines (1789 loc) · 89.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# This file is part of prompt_processing.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
__all__ = ["get_central_butler", "make_local_repo", "make_local_cache",
"MiddlewareInterface"]
import collections.abc
import functools
import itertools
import logging
import os
import os.path
import re
import tempfile
import typing
import yaml
import astropy
import botocore.exceptions
import sqlalchemy.exc
import lsst.utils.timer
from lsst.resources import ResourcePath
import lsst.sphgeom
import lsst.afw.cameraGeom
from lsst.pipe.base.mp_graph_executor import MPGraphExecutor
from lsst.pipe.base.separable_pipeline_executor import SeparablePipelineExecutor
from lsst.pipe.base.single_quantum_executor import SingleQuantumExecutor
from lsst.daf.butler import Butler, CollectionType, DatasetType, DatasetRef, Timespan, \
DataIdValueError, DimensionRecord, MissingDatasetTypeError, MissingCollectionError
from lsst.daf.butler import Config as dafButlerConfig
import lsst.dax.apdb
import lsst.geom
import lsst.obs.base
import lsst.pipe.base
from lsst.pipe.base.quantum_graph_builder import QuantumGraphBuilderError
import lsst.analysis.tools
from lsst.analysis.tools.interfaces.datastore import SasquatchDispatcher, SasquatchDispatchFailure, \
SasquatchDispatchPartialFailure # Can't use fully-qualified names
from shared.config import PipelinesConfig
import shared.connect_utils as connect
import shared.run_utils as runs
from shared.visit import FannedOutVisit
from .caching import DatasetCache
from .exception import GracefulShutdownInterrupt, TimeoutInterrupt, NonRetriableError, RetriableError, \
InvalidPipelineError, NoGoodPipelinesError, PipelinePreExecutionError, PipelineExecutionError, \
ProvenanceDimensionsError
from .timer import enforce_schema, time_this_to_bundle
_log = logging.getLogger("lsst." + __name__)
_log.setLevel(logging.DEBUG)
# See https://developer.lsst.io/stack/logging.html#logger-trace-verbosity
_log_trace = logging.getLogger("TRACE1.lsst." + __name__)
_log_trace.setLevel(logging.CRITICAL)  # Turn off by default.
_log_trace3 = logging.getLogger("TRACE3.lsst." + __name__)
_log_trace3.setLevel(logging.CRITICAL)  # Turn off by default.

# Service tuning knobs, all overridable through environment variables.
# The number of calib datasets to keep, including the current run.
base_keep_limit = int(os.environ.get("LOCAL_REPO_CACHE_SIZE", 3))
# Whether or not to export to the central repo.
do_export = bool(int(os.environ.get("DEBUG_EXPORT_OUTPUTS", '1')))
# The number of arcseconds to pad the region in preloading spatial datasets.
padding = float(os.environ.get("PRELOAD_PADDING", 30))
# The (jittered) number of seconds to delay retrying connections to the central Butler.
repo_retry = float(os.environ.get("REPO_RETRY_DELAY", 30))
# An optional file with local butler repo config overrides.
local_repo_config = os.environ.get("LOCAL_REPO_CONFIG", None)

# Transient database errors that are worth retrying.
# TODO: revisit which cases should be retried after DM-50934
# TODO: catch ButlerConnectionError once it's available
SQL_EXCEPTIONS = (sqlalchemy.exc.OperationalError, sqlalchemy.exc.InterfaceError)
# Datastore (e.g. S3) access errors worth retrying, in addition to the SQL ones.
DATASTORE_EXCEPTIONS = SQL_EXCEPTIONS + (botocore.exceptions.ClientError, )
@connect.retry(2, SQL_EXCEPTIONS, wait=repo_retry)
def get_central_butler(central_repo: str, instrument_class: str, writeable: bool):
    """Provide a Butler that can access the given repository and read and write
    data for the given instrument.

    Parameters
    ----------
    central_repo : `str`
        The path or URI to the central repository.
    instrument_class : `str`
        The name of the instrument whose data will be retrieved or written. May
        be either the fully qualified class name or the short name.
    writeable : `bool`
        Whether or not it's safe to attempt writes to this Butler.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler for ``central_repo`` pre-configured to load and store
        ``instrument_name`` data.
    """
    # Defaults are deliberately not inferred; callers supply explicit data IDs.
    butler = Butler(central_repo, writeable=writeable, inferDefaults=False)
    return butler
def make_local_repo(local_storage: str, central_butler: Butler, instrument: str):
    """Create and configure a new local repository.

    The repository is represented by a temporary directory object, which can be
    used to manage its lifetime.

    Parameters
    ----------
    local_storage : `str`
        An absolute path to a space where this function can create a local
        Butler repo.
    central_butler : `lsst.daf.butler.Butler`
        Butler repo containing instrument and skymap definitions.
    instrument : `str`
        Name of the instrument taking the data, for populating
        butler collections and dataIds. May be either the fully qualified class
        name or the short name. Examples: "LsstCam", "lsst.obs.lsst.LsstCam".

    Returns
    -------
    repo_dir
        An object of the same type as returned by `tempfile.TemporaryDirectory`,
        pointing to the local repo location.
    """
    # Use the central repo's dimension universe so that datasets can later be
    # transferred between the two repositories.
    dimension_config = central_butler.dimensions.dimensionConfig
    repo_dir = tempfile.TemporaryDirectory(dir=local_storage, prefix="butler-")
    config = dafButlerConfig(local_repo_config)
    butler = Butler(
        Butler.makeRepo(repo_dir.name, config=config, dimensionConfig=dimension_config),
        writeable=True,
    )
    _log.info("Created local Butler repo at %s with dimensions-config %s %d.",
              repo_dir.name, dimension_config["namespace"], dimension_config["version"])
    # Run-once repository initialization: the instrument must be registered
    # before instrument-derived collection names can be created.
    instrument = lsst.obs.base.Instrument.from_string(instrument, central_butler.registry)
    instrument.register(butler.registry)
    butler.collections.register(instrument.makeUmbrellaCollectionName(), CollectionType.CHAINED)
    butler.collections.register(instrument.makeDefaultRawIngestRunName(), CollectionType.RUN)
    return repo_dir
def make_local_cache():
    """Set up a cache for preloaded datasets.

    Returns
    -------
    cache : `activator.caching.DatasetCache`
        An empty cache with configured caching strategy and limits.
    """
    # Limit comes from the LOCAL_REPO_CACHE_SIZE environment variable.
    cache = DatasetCache(base_keep_limit)
    return cache
def _get_sasquatch_dispatcher():
"""Get a SasquatchDispatcher object ready for use by Prompt Processing.
Returns
-------
dispatcher : `lsst.analysis.tools.interfaces.datastore.SasquatchDispatcher` \
or `None`
The object to handle all Sasquatch uploads from this module. If `None`,
the service is not configured to use Sasquatch.
"""
url = os.environ.get("SASQUATCH_URL", "")
if not url:
return None
token = os.environ.get("SASQUATCH_TOKEN", "")
namespace = os.environ.get("DAF_BUTLER_SASQUATCH_NAMESPACE", "lsst.prompt")
return SasquatchDispatcher(url=url, token=token, namespace=namespace)
GroupedDimensionRecords: typing.TypeAlias = dict[str, list[DimensionRecord]]
"""Dictionary from dimension name to list of dimension records for that
dimension.
"""
class ButlerWriter(typing.Protocol):
    """Interface defining functions for writing output datasets back to the central
    Butler repository.

    This is a structural (duck-typed) protocol; implementers need not inherit
    from it.
    """

    def transfer_outputs(
        self, local_butler: Butler, dimension_records: GroupedDimensionRecords, datasets: list[DatasetRef]
    ) -> list[DatasetRef]:
        """Transfer outputs back to the central repository.

        Parameters
        ----------
        local_butler : `lsst.daf.butler.Butler`
            Local Butler repository from which output datasets will be
            transferred.
        dimension_records : `dict` [`str` , `list` [`lsst.daf.butler.DimensionRecord`]]
            Dimension records to write to the central Butler repository.
        datasets : `list` [`lsst.daf.butler.DatasetRef`]
            Datasets to transfer to the central Butler repository.

        Returns
        -------
        transferred : `list` [`lsst.daf.butler.DatasetRef`]
            List of datasets actually transferred.
        """
class DirectButlerWriter(ButlerWriter):
    """A `ButlerWriter` that pushes outputs straight into the central
    repository by connecting directly to the Butler database.

    Parameters
    ----------
    central_butler : `lsst.daf.butler.Butler`
        Butler repo to which pipeline outputs should be written.
    """

    def __init__(self, central_butler: Butler) -> None:
        self._central_butler = central_butler

    def transfer_outputs(
        self, local_butler: Butler, dimension_records: GroupedDimensionRecords, datasets: list[DatasetRef]
    ) -> list[DatasetRef]:
        # Records must be inserted in dependency (sorted) order.
        for element in local_butler.dimensions.sorted(dimension_records.keys()):
            # If records don't match, this is not an error, and central takes precedence.
            self._central_butler.registry.insertDimensionData(
                element, *dimension_records[element.name], skip_existing=True)
        return self._central_butler.transfer_from(
            local_butler, datasets, transfer="copy", transfer_dimensions=False)
class MiddlewareInterface:
    """Interface layer between the Butler middleware and the prompt processing
    data handling system, to handle processing individual images.

    An instance of this class will accept an incoming group of single-detector
    snaps to process, using an instance-local butler repo. The instance can
    pre-load the necessary calibrations to process an incoming detector-visit,
    ingest the data when it is available, and run the difference imaging
    pipeline, all in that local butler.

    Each instance must be used for processing only one group-detector
    combination. The object may contain state that is unique to a particular
    processing run.

    ``MiddlewareInterface`` objects are not thread- or process-safe. It is up
    to the client to avoid conflicts from multiple objects trying to access the
    same local repo.

    Parameters
    ----------
    read_butler : `lsst.daf.butler.Butler`
        Butler repo containing the calibration and other data needed for
        processing images as they are received. This butler must be created
        with the default instrument and skymap assigned.
    butler_writer : `activator.middleware_interface.ButlerWriter`
        Object that will be used to write the pipeline outputs back to the
        central Butler repository.
    image_bucket : `str`
        Storage bucket where images will be written to as they arrive.
        See also ``prefix``.
    visit : `shared.visit.FannedOutVisit`
        The visit-detector combination to be processed by this object.
    pre_pipelines : `shared.config.PipelinesConfig`
        Information about which pipelines to run before a visit arrives.
    main_pipelines : `shared.config.PipelinesConfig`
        Information about which pipelines to run on ``visit``'s raws.
    skymap : `str`
        Name of the skymap in the central repo for querying templates.
    local_repo : `str`
        A URI to the local Butler repo, which is assumed to already exist and
        contain standard collections and the registration of ``instrument``.
    local_cache : `activator.caching.DatasetCache`
        A cache holding datasets and usage history for preloaded dataset types
        in ``local_repo``.
    prefix : `str`, optional
        URI scheme followed by ``://``; prepended to ``image_bucket`` when
        constructing URIs to retrieve incoming files. The default is
        appropriate for use in the USDF environment; typically only
        change this when running local tests.
    """

    DATASET_IDENTIFIER = "Live"
    """The dataset ID used for Sasquatch uploads.
    """

    _collection_skymap = "skymaps"
    """The collection used for skymaps.
    """

    @property
    def _collection_template(self):
        """The collection used for templates.

        This collection depends on initialization parameters, and must
        not be called from this object's constructor.
        """
        return self.instrument.makeCollectionName("templates")

    # Class invariants:
    # self.image_host is a valid URI with non-empty path and no query or fragment.
    # self._download_store is None if and only if self.image_host is a local URI.
    # self.visit, self.instrument, self.camera, self.skymap, self._deployment
    #   self._day_obs do not change after __init__.
    # self.butler defaults to the "defaults" chained collection, which contains
    #   all pipeline inputs. However, self.butler is not
    #   guaranteed to contain concrete data, or even the dimensions
    #   corresponding to self.camera and self.skymap. Do not assume that
    #   self.butler is the only Butler pointing to the local repo.
def __init__(self, read_butler: Butler, butler_writer: ButlerWriter, image_bucket: str,
             visit: FannedOutVisit,
             pre_pipelines: PipelinesConfig, main_pipelines: PipelinesConfig,
             # TODO: encapsulate relationship between local_repo and local_cache
             skymap: str, local_repo: str, local_cache: DatasetCache,
             prefix: str = "s3://"):
    self.visit = visit
    self._apdb_config = os.environ["CONFIG_APDB"]
    # Deployment/version ID -- potentially expensive to generate.
    self._deployment = runs.get_deployment(self._apdb_config)
    self.read_central_butler = read_butler
    self._butler_writer = butler_writer
    self.image_host = prefix + image_bucket
    # TODO: _download_store turns MWI into a tagged class; clean this up later
    if not self.image_host.startswith("file"):
        # Non-local images are staged to a temporary holding area first.
        self._download_store = tempfile.TemporaryDirectory(prefix="holding-")
    else:
        self._download_store = None
    # TODO: how much overhead do we pick up from going through the registry?
    self.instrument = lsst.obs.base.Instrument.from_string(
        visit.instrument, self.read_central_butler.registry)
    self.pre_pipelines = pre_pipelines
    self.main_pipelines = main_pipelines
    now = astropy.time.Time.now()
    self._day_obs = runs.get_day_obs(now)
    # NOTE: initialization order matters -- the local Butler must exist
    # before anything below that registers collections, dimensions, or
    # dataset types in it.
    self._init_local_butler(local_repo, [self.instrument.makeUmbrellaCollectionName()], None)
    self._init_governor_datasets(now, skymap)
    self.cache = local_cache  # DO NOT copy -- we want to persist this state!
    self._prep_collections()
    self._define_dimensions()
    self._init_ingester()
    self._init_visit_definer()
    self._init_provenance_dataset_type()
    # How much to pad the spatial region we will copy over.
    self.padding = padding*lsst.geom.arcseconds
def _init_local_butler(self, repo_uri: str, output_collections: list[str], output_run: str):
    """Prepare the local butler to ingest into and process from.

    ``self.butler`` is correctly initialized after this method returns.

    Parameters
    ----------
    repo_uri : `str`
        A URI to the location of the local repository.
    output_collections : `list` [`str`]
        The collection(s) in which to search for inputs and outputs.
    output_run : `str`
        The run in which to place new pipeline outputs.
    """
    # Configuring the Butler with these collections keeps a reference to the
    # newly prepared collection and makes any inputs visible for queries.
    self.butler = Butler(
        repo_uri,
        collections=output_collections,
        run=output_run,
        writeable=True,
    )
def _init_ingester(self):
    """Prepare the raw file ingester to receive images into this butler.

    ``self._init_local_butler`` must have already been run.
    """
    ingest_config = lsst.obs.base.RawIngestConfig()
    self.instrument.applyConfigOverrides(lsst.obs.base.RawIngestTask._DefaultName, ingest_config)
    ingest_config.transfer = "copy"  # Copy files into the local butler.
    ingest_config.failFast = True  # We want failed ingests to fail immediately.
    self.rawIngestTask = lsst.obs.base.RawIngestTask(config=ingest_config, butler=self.butler)
def _init_visit_definer(self):
    """Prepare the visit definer to define visits for this butler.

    ``self._init_local_butler`` must have already been run.
    """
    dv_config = lsst.obs.base.DefineVisitsConfig()
    self.instrument.applyConfigOverrides(lsst.obs.base.DefineVisitsTask._DefaultName, dv_config)
    # Each exposure is its own visit in prompt processing.
    dv_config.groupExposures = "one-to-one"
    self.define_visits = lsst.obs.base.DefineVisitsTask(config=dv_config, butler=self.butler)
@connect.retry(2, DATASTORE_EXCEPTIONS, wait=repo_retry)
def _init_governor_datasets(self, timestamp, skymap):
    """Load and store the camera and skymap for later use.

    ``self._init_local_butler`` must have already been run.

    Parameters
    ----------
    timestamp : `astropy.time.Time`
        The time at which the camera must be valid.
    skymap : `str`
        The name of the skymap to load.
    """
    # Camera is time-dependent, in principle, and may be available only
    # through a calibration collection.
    ref = self.read_central_butler.find_dataset(
        "camera",
        instrument=self.instrument.getName(),
        collections=self.instrument.makeCalibrationCollectionName(),
        timespan=Timespan.fromInstant(timestamp),
    )
    self.camera = self.read_central_butler.get(ref)
    self.skymap_name = skymap
    self.skymap = self.read_central_butler.get(
        "skyMap", skymap=self.skymap_name, collections=self._collection_skymap)
def _init_provenance_dataset_type(self):
    """Register the dataset types used to store provenance information.

    ``self._init_local_butler`` must have already been run.
    """
    provenance_type = DatasetType(
        "prompt_provenance",
        self.butler.dimensions.conform(["group", "detector"]),
        "ProvenanceQuantumGraph",
    )
    self._provenance_dataset_type = provenance_type
    self.butler.registry.registerDatasetType(provenance_type)
def _define_dimensions(self):
    """Define any dimensions that must be computed from this object's visit.

    ``self._init_local_butler`` must have already been run.
    """
    group_record = {
        "name": self.visit.groupId,
        "instrument": self.instrument.getName(),
    }
    self.butler.registry.syncDimensionData("group", group_record)
def _cache_datasets(self, refs: collections.abc.Iterable[lsst.daf.butler.DatasetRef]):
    """Add or mark requested datasets in the cache.

    Parameters
    ----------
    refs : iterable [`lsst.daf.butler.DatasetRef`]
        The datasets to cache. Assumed to all fit inside the cache.

    Raises
    ------
    RuntimeError
        Raised if the cache cannot hold all of ``refs`` at once.
    """
    # Materialize first: ``refs`` is iterated twice below, and a one-shot
    # iterator argument would otherwise appear empty on the second pass.
    refs = list(refs)
    evicted = self.cache.update(refs)
    # Evicted datasets are no longer needed locally; remove them entirely.
    self.butler.pruneDatasets(evicted, disassociate=True, unstore=True, purge=True)
    try:
        self.cache.access(refs)
    except LookupError as e:
        raise RuntimeError("Cache is too small for one run's worth of datasets.") from e
def _pad_region(self,
                initial_region: lsst.sphgeom.Region,
                wcs: lsst.afw.geom.SkyWcs,
                ) -> lsst.sphgeom.Region:
    """Pad the expected footprint to allow for slew errors.

    This method emits a warning if the preload padding is too small.

    Parameters
    ----------
    initial_region : `lsst.sphgeom.Region`
        The unpadded region to expand.
    wcs : `lsst.afw.geom.SkyWcs`
        A WCS for the current image. Only needs to be good enough to get
        the plate scale.

    Returns
    -------
    region : `lsst.sphgeom.Region`
        The padded region.

    Raises
    ------
    TypeError
        Raised if padding is not supported for ``initial_region``.
    """
    # Compare the preload region padding versus the visit region padding
    # in the middleware visit definition.
    visit_definition_padding = (
        self.define_visits.config.computeVisitRegions["single-raw-wcs"].padding
        * wcs.getPixelScale().asArcseconds()
    )
    preload_region_padding = self.padding.asArcseconds()
    if preload_region_padding < visit_definition_padding:
        _log.warning("Preload padding (%.1f arcsec) is smaller than "
                     "visit definition's region padding (%.1f arcsec).",
                     preload_region_padding, visit_definition_padding)
    if isinstance(initial_region, lsst.sphgeom.ConvexPolygon):
        # Push each vertex radially outward from the centroid by the padding
        # angle, then take the convex hull of the displaced corners.
        center = lsst.geom.SpherePoint(initial_region.getCentroid())
        corners = [lsst.geom.SpherePoint(c) for c in initial_region.getVertices()]
        padded = [c.offset(center.bearingTo(c), self.padding) for c in corners]
        return lsst.sphgeom.ConvexPolygon.convexHull([c.getVector() for c in padded])
    elif isinstance(initial_region, lsst.sphgeom.Circle):
        # For a circle, padding is simply a larger opening angle.
        return lsst.sphgeom.Circle(initial_region.getCenter(),
                                   initial_region.getOpeningAngle() + self.padding)
    else:
        raise TypeError(f"Cannot pad region {initial_region!r}.")
def prep_butler(self) -> None:
    """Prepare a temporary butler repo for processing the incoming data.

    After this method returns, the internal butler is guaranteed to contain
    all data and all dimensions needed to run the appropriate pipeline on
    this object's visit, except for ``raw`` and the ``exposure`` and
    ``visit`` dimensions, respectively. It may contain other data that would
    not be loaded when processing the visit.
    """
    action_id = "prepButlerTimeMetric"  # For consistency with analysis_tools outputs
    bundle = lsst.analysis.tools.interfaces.MetricMeasurementBundle(
        dataset_identifier=self.DATASET_IDENTIFIER,
    )
    with time_this_to_bundle(bundle, action_id, "prep_butlerTotalTime"):
        with lsst.utils.timer.time_this(_log, msg="prep_butler", level=logging.DEBUG):
            _log.info(f"Preparing Butler for visit {self.visit!r}")
            wcs = self.visit.predict_wcs(self.camera)
            if wcs:
                region = self._pad_region(self.visit.get_detector_icrs_region(self.camera), wcs)
                _log.debug(
                    f"Preload region {region} including padding {self.padding.asArcseconds()} arcsec.")
                self._write_region_time(region)  # Must be done before preprocessing pipeline
            else:
                # Without a WCS there is no region; spatial preloads are
                # skipped further down.
                _log.warning("Could not get sky position from visit %s. "
                             "Spatial datasets won't be loaded.", self.visit)
                region = None
            with time_this_to_bundle(bundle, action_id, "prep_butlerSearchTime"):
                all_datasets, calib_datasets = self._find_data_to_preload(region)
            with time_this_to_bundle(bundle, action_id, "prep_butlerTransferTime"):
                self._transfer_data(all_datasets, calib_datasets)
            with time_this_to_bundle(bundle, action_id, "prep_butlerPreprocessTime"):
                try:
                    self._run_preprocessing()
                except NoGoodPipelinesError:
                    # Preprocessing is best-effort; the main pipelines may
                    # still be able to run without its outputs.
                    _log.exception("Preprocessing pipelines not runnable, trying main pipelines anyway.")
                except (PipelinePreExecutionError, PipelineExecutionError):
                    _log.exception("Preprocessing pipeline failed, trying main pipelines anyway.")
    # IMPORTANT: do not remove or rename entries in this list. New entries can be added as needed.
    enforce_schema(bundle, {action_id: ["prep_butlerTotalTime",
                                        "prep_butlerSearchTime",
                                        "prep_butlerTransferTime",
                                        "prep_butlerPreprocessTime",
                                        ]})
    self.butler.registry.registerDatasetType(DatasetType(
        "promptPreload_metrics",
        dimensions={"instrument", "group", "detector"},
        storageClass="MetricMeasurementBundle",
        universe=self.butler.dimensions,
    ))
    self.butler.put(bundle,
                    "promptPreload_metrics",
                    run=runs.get_preload_run(self.instrument, self._deployment, self._day_obs),
                    instrument=self.instrument.getName(),
                    detector=self.visit.detector,
                    group=self.visit.groupId)
@connect.retry(2, SQL_EXCEPTIONS, wait=repo_retry)
def _find_data_to_preload(self, region):
    """Identify the datasets to export from the central repo.

    The returned datasets are a superset of those needed by any pipeline,
    but exclude any datasets that are already present in the local repo.

    Parameters
    ----------
    region : `lsst.sphgeom.Region` or None
        The region to find data to preload.

    Returns
    -------
    datasets : set [`~lsst.daf.butler.DatasetRef`]
        The datasets to be exported, after any filtering.
    calibs : set [`~lsst.daf.butler.DatasetRef`]
        The subset of ``datasets`` representing calibs.

    Raises
    ------
    NoGoodPipelinesError
        Raised if the datasets found cannot support any main pipeline.
    """
    net_types = set().union(*self._get_preloadable_types().values())
    # Filter outputs made by preprocessing and consumed by main.
    for pipeline_file in self.get_pre_pipeline_files():
        net_types.difference_update(self._get_pipeline_output_types(pipeline_file))
    with lsst.utils.timer.time_this(_log, msg="prep_butler (find init-outputs)", level=logging.DEBUG):
        all_datasets = set(self._find_init_outputs())
    calib_datasets = set()
    present_types = net_types.copy()
    with lsst.utils.timer.time_this(_log, msg="prep_butler (find inputs)", level=logging.DEBUG):
        for type_name in net_types:
            dstype = self.read_central_butler.get_dataset_type(type_name)
            try:
                if dstype.isCalibration():
                    new_calibs = self._find_calibs(dstype, self.visit.detector, self.visit.filters)
                    calib_datasets.update(new_calibs)
                    all_datasets.update(new_calibs)
                elif "htm7" in dstype.dimensions or "skypix" in dstype.dimensions:
                    # Spatial lookups require a region.
                    if region is not None:
                        all_datasets.update(self._find_refcats(dstype, region))
                elif "tract" in dstype.dimensions:
                    if region is not None:
                        all_datasets.update(self._find_templates(dstype, region, self.visit.filters))
                else:
                    all_datasets.update(self._find_generic_datasets(
                        dstype, self.visit.detector, self.visit.filters))
            except _MissingDatasetError:
                # A missing type is not necessarily fatal; record its absence
                # and let the runnability check below decide.
                _log.warning("Found no source datasets of type %s.", type_name)
                present_types.remove(type_name)
    if self._is_main_pipeline_runnable(present_types):
        return (all_datasets, calib_datasets)
    else:
        raise NoGoodPipelinesError("Cannot run any main pipeline.")
def _get_preloadable_types(self):
    """Identify all types to attempt to preload.

    Returns
    -------
    types : mapping [`str`, set [`str`]]
        A mapping from each pipeline's path to the types to preload for
        that pipeline.
    """
    preloadable = {}
    for pipeline_file in self.get_combined_pipeline_files():
        types = self._get_pipeline_input_types(pipeline_file)
        # These types are never preloaded.
        types -= {"regionTimeInfo", "raw"}
        preloadable[pipeline_file] = types
    return preloadable
def _is_main_pipeline_runnable(self, present_types):
    """Determine if at least one pipeline can be run with the available data.

    This method emits diagnostic logs as a side effect.

    Parameters
    ----------
    present_types : set [`str`]
        The types that are accounted for, either already present in the
        local repo or marked for download.

    Returns
    -------
    runnable : `bool`
        `True` if and only if at least one pipeline has all inputs.
    """
    # Outputs of runnable preprocessing pipelines count as inputs available
    # to the main pipelines.
    preprocessing_products = set()
    for pipeline_file in self.get_pre_pipeline_files():
        needed = self._get_pipeline_input_types(pipeline_file, include_optional=False)
        needed.discard("regionTimeInfo")
        missing = needed - present_types
        if missing:
            _log.debug("Missing inputs for %s: %s.", pipeline_file, missing)
        else:
            _log.debug("Found inputs for %s.", pipeline_file)
            preprocessing_products |= self._get_pipeline_output_types(pipeline_file)
    available = present_types | preprocessing_products
    for pipeline_file in self.get_main_pipeline_files():
        needed = self._get_pipeline_input_types(pipeline_file, include_optional=False)
        needed -= {"regionTimeInfo", "raw"}
        missing = needed - available
        if missing:
            _log.debug("Missing inputs for %s: %s.", pipeline_file, missing)
        else:
            _log.debug("Found inputs for %s.", pipeline_file)
            return True
    return False
def _get_pipeline_input_types(self, pipeline_file, include_optional=True):
    """Identify the dataset types needed as inputs for a pipeline.

    Parameters
    ----------
    pipeline_file : `str`
        The pipeline whose inputs are desired.
    include_optional : `bool`, optional
        Whether to report optional inputs (the default) or only required
        ones.

    Returns
    -------
    input_types : set [`str`]
        The types of preexisting datasets needed to run the pipeline.
    """
    try:
        graph = self._prep_pipeline_graph(pipeline_file)
    except FileNotFoundError as e:
        raise RuntimeError(f"Could not find pipeline {pipeline_file}.") from e
    consumed = set()
    produced = set()
    for task in graph.tasks.values():
        for edge in task.iter_all_inputs():
            if include_optional or not task.is_optional(edge.key[2]):
                consumed.add(edge.parent_dataset_type_name)
        for edge in task.iter_all_outputs():
            produced.add(edge.parent_dataset_type_name)
    # Types produced internally by the pipeline need not preexist.
    return consumed - produced
def _get_pipeline_output_types(self, pipeline_file):
    """Identify the dataset types produced as outputs by a pipeline.

    Parameters
    ----------
    pipeline_file : `str`
        The pipeline whose outputs are desired.

    Returns
    -------
    output_types : set [`str`]
        The types of datasets the pipeline would produce.
    """
    try:
        graph = self._prep_pipeline_graph(pipeline_file)
    except FileNotFoundError as e:
        raise RuntimeError(f"Could not find pipeline {pipeline_file}.") from e
    produced = set()
    for task in graph.tasks.values():
        produced.update(edge.parent_dataset_type_name for edge in task.iter_all_outputs())
    return produced
def _find_refcats(self, dataset_type, region):
    """Identify the refcats to export from the central butler.

    Parameters
    ----------
    dataset_type : `lsst.daf.butler.DatasetType`
        The type of refcat to search for.
    region : `lsst.sphgeom.Region`
        The region to find refcat shards in.

    Returns
    -------
    refcats : iterable [`lsst.daf.butler.DatasetRef`]
        The refcats to be exported, after any filtering.
    """
    # Get shards from all refcats that overlap this region.
    query = _generic_query(
        [dataset_type],
        collections=self.instrument.makeRefCatCollectionName(),
        where="htm7.region OVERLAPS search_region",
        bind={"search_region": region},
        find_first=True,
    )
    # Don't cache refcats
    refcats = set(_filter_datasets(self.read_central_butler, self.butler, query))
    if refcats:
        _log.debug("Found %d new refcat datasets from catalog '%s'.", len(refcats), dataset_type.name)
    return refcats
def _find_templates(self, dataset_type, region, physical_filter):
    """Identify the templates to export from the central butler.

    Parameters
    ----------
    dataset_type : `lsst.daf.butler.DatasetType`
        The type of template to search for.
    region : `lsst.sphgeom.Region`
        The region to load the templates tract/patches for.
    physical_filter : `str`
        Physical filter for which to export templates. May be empty to
        indicate no specific filter.

    Returns
    -------
    templates : iterable [`lsst.daf.butler.DatasetRef`]
        The datasets to be exported, after any filtering.
    """
    # Templates are per-band; without a filter there is nothing to preload.
    if not physical_filter:
        _log.warning("Preloading templates is not supported for visits without a specific filter.")
        return set()
    search_id = {
        "instrument": self.instrument.getName(),
        "skymap": self.skymap_name,
        "physical_filter": physical_filter,
    }
    template_query = _generic_query(
        [dataset_type],
        collections=self._collection_template,
        data_id=search_id,
        where="patch.region OVERLAPS search_region",
        bind={"search_region": region},
        find_first=True,
    )
    # No all_callback here: templates are deliberately not cached.
    found = set(_filter_datasets(self.read_central_butler, self.butler, template_query))
    if found:
        _log.debug("Found %d new template datasets of type %s.", len(found), dataset_type.name)
    return found
def _find_calibs(self, dataset_type, detector_id, physical_filter):
    """Identify the calibs to export from the central butler.

    Parameters
    ----------
    dataset_type : `lsst.daf.butler.DatasetType`
        The type of calib to search for.
    detector_id : `int`
        Identifier of the detector to load calibs for.
    physical_filter : `str`
        Physical filter name of the upcoming visit. May be empty to indicate
        no specific filter.

    Returns
    -------
    calibs : iterable [`lsst.daf.butler.DatasetRef`]
        The calibs to be exported, after any filtering.
    """
    # TAI observation start time should be used for calib validity range.
    calib_date = astropy.time.Time(self.visit.private_sndStamp, format="unix_tai")
    data_id = {"instrument": self.instrument.getName(), "detector": detector_id}
    if physical_filter:
        data_id["physical_filter"] = physical_filter
    elif "physical_filter" in dataset_type.dimensions or "band" in dataset_type.dimensions:
        # Filter-dependent calib requested but no filter known: nothing to do.
        _log.warning("Preloading filter-dependent calibs is not supported for visits "
                     "without a specific filter.")
        return set()

    def query_calibs_by_date(butler, label):
        # Query `butler` for calibs of this type whose certification timespan
        # covers calib_date; `label` is used only for log messages.
        with butler.query() as query:
            expr = query.expression_factory
            query = query.where(data_id)
            try:
                datasets = set(
                    query.datasets(dataset_type,
                                   self.instrument.makeCalibrationCollectionName(),
                                   find_first=True)
                    # where needs to come after datasets to pick up the type
                    .where(expr[dataset_type.name].timespan.overlaps(calib_date))
                    .with_dimension_records()
                )
            except (DataIdValueError, MissingDatasetTypeError, MissingCollectionError) as e:
                # Dimensions/dataset type often invalid for fresh local repo,
                # where there are no, and never have been, any matching datasets.
                # May have dimensions but no collections if a previous preload failed.
                # These *are* a problem for the central repo, but can be caught later.
                _log.debug("%s query failed with %s.", label, e)
                datasets = set()
        # Trace3 because, in many contexts, datasets is too large to print.
        _log_trace3.debug("%s: %s", label, datasets)
        return datasets

    # Compare central vs. local results, caching whatever the central repo has.
    calibs = set(_filter_datasets(
        self.read_central_butler, self.butler,
        query_calibs_by_date,
        all_callback=self._cache_datasets,
    ))
    if calibs:
        _log.debug("Found %d new calib datasets of type '%s'.", len(calibs), dataset_type.name)
    return calibs
def _find_generic_datasets(self, dataset_type, detector_id, physical_filter):
    """Identify datasets to export from the central butler.

    Parameters
    ----------
    dataset_type : `lsst.daf.butler.DatasetType`
        The type of dataset to search for.
    detector_id : `int`
        Identifier of the detector to load calibs for.
    physical_filter : `str`
        Physical filter name of the upcoming visit. May be empty to indicate
        no specific filter.

    Returns
    -------
    datasets : iterable [`lsst.daf.butler.DatasetRef`]
        The datasets to be exported, after any filtering.
    """
    search_id = {
        "instrument": self.instrument.getName(),
        "skymap": self.skymap_name,
        "detector": detector_id,
    }
    # Only constrain by filter when one is known.
    if physical_filter:
        search_id["physical_filter"] = physical_filter
    umbrella_query = _generic_query(
        [dataset_type],
        collections=self.instrument.makeUmbrellaCollectionName(),
        data_id=search_id,
        find_first=True,
    )
    found = set(_filter_datasets(
        self.read_central_butler, self.butler,
        umbrella_query,
        all_callback=self._cache_datasets,
    ))
    if found:
        _log.debug("Found %d new datasets of type %s.", len(found), dataset_type.name)
    return found
def _get_init_output_types(self, pipeline_file):
    """Identify the specific init-output types to query.

    Parameters
    ----------
    pipeline_file : `str`
        The pipeline of interest.

    Returns
    -------
    init_types : collection [`str`]
        The init-output types of interest to Prompt Processing.
    """
    try:
        graph = self._prep_pipeline_graph(pipeline_file)
    except FileNotFoundError as e:
        raise RuntimeError(f"Could not find pipeline {pipeline_file}.") from e
    # Collect the (parent) init-output types from every task in the graph.
    init_types = set()
    for task in graph.tasks.values():
        init_types.update(edge.parent_dataset_type_name for edge in task.init.iter_all_outputs())
    return init_types
def _find_init_outputs(self):
    """Identify the init-output datasets to export from the central butler.

    Returns
    -------
    init_outputs : iterable [`lsst.daf.butler.DatasetRef`]
        The datasets to be exported.

    Raises
    ------
    _MissingDatasetError
        Raised if the central repo query found nothing.
    """
    init_outputs = set()
    for pipeline_file in self.get_combined_pipeline_files():
        output_run = runs.get_output_run(self.instrument, self._deployment, pipeline_file, self._day_obs)
        init_types = self._get_init_output_types(pipeline_file)
        # Output runs are always cleared after execution, so _filter_datasets
        # would always warn; that also means the init-outputs don't need to be
        # cached with _cache_datasets. Query the central butler directly.
        run_query = _generic_query(init_types, collections=output_run)
        init_outputs.update(run_query(self.read_central_butler, "source datasets"))
    if not init_outputs:
        raise _MissingDatasetError("Source repo query found no matches.")
    for output_run, count in self._count_by_key(init_outputs, lambda ref: ref.run):
        _log.debug("Found %d new init-output datasets from %s.", count, output_run)
    return init_outputs
@connect.retry(2, DATASTORE_EXCEPTIONS, wait=repo_retry)
def _transfer_data(self, datasets, calibs):
"""Transfer datasets and all associated collections from the central
repo to the local repo.
Parameters
----------
datasets : set [`~lsst.daf.butler.DatasetRef`]
The datasets to transfer into the local repo.
calibs : set [`~lsst.daf.butler.DatasetRef`]
The calibs to re-certify into the local repo.
"""
with lsst.utils.timer.time_this(_log, msg="prep_butler (transfer datasets)", level=logging.DEBUG):
transferred = self.butler.transfer_from(self.read_central_butler,
datasets,
transfer="copy",
skip_missing=True,
register_dataset_types=True,
transfer_dimensions=True,
)
missing = _check_transfer_completion(datasets, transferred, "Downloaded")
with lsst.utils.timer.time_this(_log, msg="prep_butler (transfer collections)", level=logging.DEBUG):
self._export_collections(self._collection_template)
self._export_collections(self.instrument.makeUmbrellaCollectionName())
with lsst.utils.timer.time_this(_log, msg="prep_butler (transfer associations)", level=logging.DEBUG):
self._export_calib_associations(self.instrument.makeCalibrationCollectionName(),
calibs - missing)
# Temporary workarounds until we have a prompt-processing default top-level collection
# in shared repos, and raw collection in dev repo, and then we can organize collections
# without worrying about DRP use cases.