Commit 66ddefb

Override data_type from dataset (#376)
* Don't always run coverage after tests
* Always override the data_type from the dataset

  It is crucial that the data type always exactly corresponds to the physical dataset, so we ensure that it is overridden every time we work with metadata. As part of this change we also perform dataset consistency checks, even when we're not merging. This will assist in maintaining consistency between dataset and metadata.

* Fix logic
* Create test datasets on the fly
* Fix tests
* Fix mypy
* Add test for overriding data types
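
To make the central rule concrete, here is a minimal runnable sketch of the merge behaviour described above. `Var` and `merge_vars` are hypothetical simplifications for illustration, not the library's models; the actual logic lives in `merge_variables` in `_merge.py` (see the diff below).

```python
# Illustrative sketch only: Var and merge_vars are hypothetical stand-ins
# for the library's Variable model and merge_variables function.
from dataclasses import dataclass


@dataclass
class Var:
    short_name: str
    data_type: str | None = None


def merge_vars(extracted: list[Var], existing: list[Var]) -> list[Var]:
    """Keep the documented metadata per variable, but always take
    data_type from the extracted (physical) dataset."""
    by_name = {v.short_name: v for v in existing}
    merged = []
    for ext in extracted:
        if (old := by_name.get(ext.short_name)) is not None:
            old.data_type = ext.data_type  # the dataset always wins
            merged.append(old)
        else:
            merged.append(ext)
    return merged


# A stale "STRING" in the metadata document is corrected to the dataset's type.
assert merge_vars([Var("age", "INTEGER")], [Var("age", "STRING")])[0].data_type == "INTEGER"
```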
1 parent 9c92711 commit 66ddefb

23 files changed (+739, −207 lines)

noxfile.py

Lines changed: 10 additions & 14 deletions

```diff
@@ -73,20 +73,16 @@ def mypy(session: nox.Session) -> None:
 def tests(session: nox.Session) -> None:
     """Run the test suite."""
     install_with_uv(session, groups=["test"])
-    try:
-        session.run(
-            "coverage",
-            "run",
-            "--parallel",
-            "-m",
-            "pytest",
-            "-o",
-            "pythonpath=",
-            *session.posargs,
-        )
-    finally:
-        if session.interactive:
-            session.notify("coverage", posargs=[])
+    session.run(
+        "coverage",
+        "run",
+        "--parallel",
+        "-m",
+        "pytest",
+        "-o",
+        "pythonpath=",
+        *session.posargs,
+    )
 
 
 @nox.session(python=python_versions[-1], default=False)
```
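
With the `try`/`finally` removed, the session no longer auto-notifies the coverage session; `coverage run` simply wraps pytest every time. Arguments after `--` are still forwarded to pytest through `*session.posargs`, e.g. `nox -s tests -- -k merge` (standard nox behaviour).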

src/dapla_metadata/datasets/_merge.py

Lines changed: 53 additions & 27 deletions

```diff
@@ -9,7 +9,6 @@
 make changes as appropriate.
 """
 
-import copy
 import logging
 import warnings
 from collections.abc import Iterable
@@ -42,7 +41,6 @@
 )
 VARIABLE_RENAME_MESSAGE = "Variables have been renamed in the dataset"
 VARIABLE_ORDER_MESSAGE = "The order of variables in the dataset has changed"
-VARIABLE_DATATYPES_MESSAGE = "Variable datatypes differ"
 VARIABLES_FEWER_MESSAGE = "Dataset has fewer variables than defined in metadata"
 
 
@@ -164,13 +162,6 @@ def check_variables_consistency(
                 == [v.short_name or "" for v in existing_variables],
             )
         )
-        results.append(
-            DatasetConsistencyStatus(
-                message=VARIABLE_DATATYPES_MESSAGE,
-                success=[v.data_type for v in extracted_variables]
-                == [v.data_type for v in existing_variables],
-            )
-        )
     else:
         results.extend(
             [
@@ -252,6 +243,7 @@ def merge_variables(
     existing_metadata: OptionalDatadocMetadataType,
     extracted_metadata: all_optional_model.DatadocMetadata,
     merged_metadata: all_optional_model.DatadocMetadata,
+    explicitly_defined_metadata_document: bool = True,
 ) -> all_optional_model.DatadocMetadata:
     """Merges variables from the extracted metadata into the existing metadata and updates the merged metadata.
 
@@ -264,6 +256,9 @@
         existing_metadata: The metadata object containing the current state of variables.
         extracted_metadata: The metadata object containing new or updated variables to merge.
         merged_metadata: The metadata object that will contain the result of the merge.
+        explicitly_defined_metadata_document: True when the user has supplied a path to a metadata document in addition to
+            the dataset. This is done when re-using metadata from another dataset for convenience. There are some differences
+            in behaviour in this case.
 
     Returns:
         all_optional_model.DatadocMetadata: The `merged_metadata` object containing variables from both `existing_metadata`
@@ -277,24 +272,26 @@
         and merged_metadata.variables is not None
     ):
         for extracted in extracted_metadata.variables:
-            existing = next(
+            if existing := next(
                 (
                     existing
                     for existing in existing_metadata.variables
                     if existing.short_name == extracted.short_name
                 ),
                 None,
-            )
-            if existing:
-                existing.id = (
-                    None  # Set to None so that it will be set assigned a fresh ID later
-                )
+            ):
+                if explicitly_defined_metadata_document:
+                    # In this case we're transferring metadata to a new dataset so we must
+                    # assign a new ID
+                    existing.id = None  # Set to None so that it will be assigned a fresh ID later
                 existing.contains_data_from = (
                     extracted.contains_data_from or existing.contains_data_from
                 )
                 existing.contains_data_until = (
                     extracted.contains_data_until or existing.contains_data_until
                 )
+                # We must ensure that the data type always corresponds to the dataset.
+                existing.data_type = extracted.data_type
                 merged_metadata.variables.append(
                     cast("datadoc_model.all_optional.model.Variable", existing)
                 )
@@ -307,31 +304,60 @@
 def merge_metadata(
     extracted_metadata: all_optional_model.DatadocMetadata | None,
     existing_metadata: OptionalDatadocMetadataType,
+    explicitly_defined_metadata_document: bool = True,
 ) -> all_optional_model.DatadocMetadata:
-    if not existing_metadata:
+    """Merge metadata extracted from a dataset with existing metadata from a metadata document.
+
+    There are two cases this function can handle:
+    1. When the user has explicitly supplied the path to another metadata document. This is
+       convenience functionality to allow metadata to be reused and copied from one dataset
+       to another. In this case there is an explicit list of fields which are to be copied
+       over, defined in `DATASET_FIELDS_FROM_EXISTING_METADATA`. We also want to create new
+       IDs for the dataset and all variables.
+    2. When the metadata document belongs to the dataset itself. In this case the existing
+       metadata is used as the base and values derived from the physical dataset, such as
+       the data type, are overridden.
+
+    Args:
+        existing_metadata: The metadata object containing the current state of variables.
+        extracted_metadata: The metadata object containing new or updated variables to merge.
+        explicitly_defined_metadata_document: True when the user has supplied a path to a metadata document in addition to
+            the dataset. This is done when re-using metadata from another dataset for convenience. There are some differences
+            in behaviour in this case.
+
+    Returns:
+        all_optional_model.DatadocMetadata: The `merged_metadata` resulting from merging `existing_metadata`
+            and `extracted_metadata`.
+    """
+    if not existing_metadata or not existing_metadata.dataset:
         logger.warning(
             "No existing metadata found, no merge to perform. Continuing with extracted metadata.",
         )
         return extracted_metadata or all_optional_model.DatadocMetadata()
 
-    if not extracted_metadata:
+    if not extracted_metadata or not extracted_metadata.dataset:
         return cast("all_optional_model.DatadocMetadata", existing_metadata)
 
-    # Use the extracted metadata as a base
-    merged_metadata = all_optional_model.DatadocMetadata(
-        dataset=copy.deepcopy(extracted_metadata.dataset),
-        variables=[],
-    )
-
-    override_dataset_fields(
-        merged_metadata=merged_metadata,
-        existing_metadata=existing_metadata,
-    )
+    if explicitly_defined_metadata_document:
+        # Use the extracted metadata as a base
+        merged_metadata = all_optional_model.DatadocMetadata(
+            dataset=extracted_metadata.dataset.model_dump(),
+            variables=[],
+        )
+        override_dataset_fields(
+            merged_metadata=merged_metadata,
+            existing_metadata=existing_metadata,
+        )
+    else:
+        # Use the existing metadata as a base
+        merged_metadata = all_optional_model.DatadocMetadata(
+            dataset=existing_metadata.dataset.model_dump(),
+            variables=[],
+        )
 
     # Merge variables.
     # For each extracted variable, copy existing metadata into the merged metadata
     return merge_variables(
         existing_metadata=existing_metadata,
         extracted_metadata=extracted_metadata,
         merged_metadata=merged_metadata,
+        explicitly_defined_metadata_document=explicitly_defined_metadata_document,
     )
```
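
Two details worth noting in this diff: the `if existing := next(...)` walrus expression folds the lookup and the `None` check into one statement, and `copy.deepcopy(...)` is replaced by `dataset.model_dump()`. Passing the dumped dict to the `DatadocMetadata` constructor lets pydantic re-validate it into a fresh `Dataset` instance, which presumably gives the same isolation as the deep copy while letting the `import copy` be dropped.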

src/dapla_metadata/datasets/core.py

Lines changed: 22 additions & 24 deletions

```diff
@@ -35,7 +35,6 @@
 from dapla_metadata.datasets.utility.constants import (
     DEFAULT_SPATIAL_COVERAGE_DESCRIPTION,
 )
-from dapla_metadata.datasets.utility.constants import METADATA_DOCUMENT_FILE_SUFFIX
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_DATASET_FIELDS
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
 from dapla_metadata.datasets.utility.urn import convert_uris_to_urns
@@ -45,6 +44,8 @@
 from dapla_metadata.datasets.utility.utils import PseudonymizationType
 from dapla_metadata.datasets.utility.utils import VariableListType
 from dapla_metadata.datasets.utility.utils import VariableType
+from dapla_metadata.datasets.utility.utils import build_dataset_path
+from dapla_metadata.datasets.utility.utils import build_metadata_document_path
 from dapla_metadata.datasets.utility.utils import calculate_percentage
 from dapla_metadata.datasets.utility.utils import derive_assessment_from_state
 from dapla_metadata.datasets.utility.utils import get_timestamp_now
@@ -143,7 +144,7 @@ def __init__(
         if dataset_path:
             self.dataset_path = UPath(dataset_path)
             if not metadata_document_path:
-                self.metadata_document = self.build_metadata_document_path(
+                self.metadata_document = build_metadata_document_path(
                     self.dataset_path,
                 )
         if metadata_document_path or dataset_path:
@@ -188,13 +189,14 @@ def _extract_metadata_from_files(self) -> None:
             and self.metadata_document
             and extracted_metadata
             and existing_metadata
-        ) and self.explicitly_defined_metadata_document:
-            self.dataset_consistency_status.extend(
-                check_dataset_consistency(
-                    self.dataset_path,
-                    self.metadata_document,
+        ):
+            if extracted_metadata.dataset and existing_metadata.dataset:
+                self.dataset_consistency_status.extend(
+                    check_dataset_consistency(
+                        UPath(str(extracted_metadata.dataset.file_path)),
+                        UPath(str(existing_metadata.dataset.file_path)),
+                    )
                 )
-            )
             self.dataset_consistency_status.extend(
                 check_variables_consistency(
                     extracted_metadata.variables or [],
@@ -209,10 +211,10 @@ def _extract_metadata_from_files(self) -> None:
             merged_metadata = merge_metadata(
                 extracted_metadata,
                 existing_metadata,
+                explicitly_defined_metadata_document=self.explicitly_defined_metadata_document,
             )
-            # We need to override this so that the document gets saved to the correct
-            # location, otherwise we would overwrite the existing document!
-            self.metadata_document = self.build_metadata_document_path(
+            # Ensure the document path corresponds to the dataset path
+            self.metadata_document = build_metadata_document_path(
                 self.dataset_path,
             )
             self._set_metadata(merged_metadata)
@@ -305,9 +307,15 @@ def _extract_metadata_from_existing_document(
                 datadoc_metadata = fresh_metadata
             if datadoc_metadata is None:
                 return None
-            return self.metadata_model.DatadocMetadata.model_validate_json(
+            existing = self.metadata_model.DatadocMetadata.model_validate_json(
                 json.dumps(datadoc_metadata),
             )
+
+            # Always override the stored dataset path to ensure it matches
+            if existing.dataset:
+                existing.dataset.file_path = str(
+                    self.dataset_path or build_dataset_path(document)
+                )
         except json.JSONDecodeError:
             logger.warning(
                 "Could not open existing metadata file %s. \
@@ -316,6 +324,8 @@ def _extract_metadata_from_existing_document(
                 exc_info=True,
             )
             return None
+        else:
+            return existing
 
     def _extract_subject_field_from_path(
         self,
@@ -403,18 +413,6 @@ def _extract_metadata_from_dataset(
         )
         return metadata
 
-    @staticmethod
-    def build_metadata_document_path(
-        dataset_path: ReadablePathLike,
-    ) -> UPath:
-        """Build the path to the metadata document corresponding to the given dataset.
-
-        Args:
-            dataset_path: Path to the dataset we wish to create metadata for.
-        """
-        dataset_path = UPath(dataset_path)
-        return dataset_path.parent / (dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX)
-
     def datadoc_model(
         self,
     ) -> all_optional_model.MetadataContainer | required_model.MetadataContainer:
```
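
The `try`/`except`/`else` shape in `_extract_metadata_from_existing_document` means the `JSONDecodeError` path still returns `None`, while the happy path returns the validated metadata from the `else` clause, now with `dataset.file_path` always overridden to match the dataset actually being opened (falling back to `build_dataset_path(document)` when no dataset path was supplied).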

src/dapla_metadata/datasets/utility/constants.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -94,6 +94,8 @@
 
 METADATA_DOCUMENT_FILE_SUFFIX = "__DOC.json"
 
+PARQUET_DATASET_FILE_EXTENSION = ".parquet"
+
 PAPIS_STABLE_IDENTIFIER_TYPE = "FREG_SNR"
 PAPIS_ENCRYPTION_KEY_REFERENCE = "papis-common-key-1"
 DAEAD_ENCRYPTION_KEY_REFERENCE = "ssb-common-key-1"
```

src/dapla_metadata/datasets/utility/utils.py

Lines changed: 28 additions & 0 deletions

```diff
@@ -24,6 +24,7 @@
 from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_SNAPSHOT_DATE
 from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_STRATEGY
 from dapla_metadata.datasets.utility.constants import ENCRYPTION_PARAMETER_STRATEGY_SKIP
+from dapla_metadata.datasets.utility.constants import METADATA_DOCUMENT_FILE_SUFFIX
 from dapla_metadata.datasets.utility.constants import NUM_OBLIGATORY_VARIABLES_FIELDS
 from dapla_metadata.datasets.utility.constants import (
     OBLIGATORY_DATASET_METADATA_IDENTIFIERS,
@@ -42,6 +43,7 @@
 )
 from dapla_metadata.datasets.utility.constants import PAPIS_ENCRYPTION_KEY_REFERENCE
 from dapla_metadata.datasets.utility.constants import PAPIS_STABLE_IDENTIFIER_TYPE
+from dapla_metadata.datasets.utility.constants import PARQUET_DATASET_FILE_EXTENSION
 from dapla_metadata.datasets.utility.enums import EncryptionAlgorithm
 
 if TYPE_CHECKING:
@@ -601,3 +603,29 @@ def read_variables_from_metadata_document(
     ]
 
     return metadata_document_variables
+
+
+def build_metadata_document_path(
+    dataset_path: ReadablePathLike,
+) -> UPath:
+    """Build the path to the metadata document corresponding to the given dataset.
+
+    Args:
+        dataset_path: Path to the dataset we wish to create metadata for.
+    """
+    dataset_path = UPath(dataset_path)
+    return dataset_path.parent / (dataset_path.stem + METADATA_DOCUMENT_FILE_SUFFIX)
+
+
+def build_dataset_path(
+    metadata_document_path: ReadablePathLike,
+) -> UPath:
+    """Build the path to the dataset corresponding to the given metadata document.
+
+    Args:
+        metadata_document_path: Path to the existing metadata document.
+    """
+    return UPath(
+        str(metadata_document_path).replace(METADATA_DOCUMENT_FILE_SUFFIX, "")
+        + PARQUET_DATASET_FILE_EXTENSION
+    )
```
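
A quick round-trip sketch of the two new helpers; the filename is invented, and the import paths are the ones added to core.py above:

```python
from dapla_metadata.datasets.utility.utils import build_dataset_path
from dapla_metadata.datasets.utility.utils import build_metadata_document_path

# person_2024.parquet is a hypothetical dataset name.
doc = build_metadata_document_path("data/person_2024.parquet")
print(doc)  # data/person_2024__DOC.json

dataset = build_dataset_path(doc)
print(dataset)  # data/person_2024.parquet
```

Since `build_dataset_path` appends the hard-coded `PARQUET_DATASET_FILE_EXTENSION`, the reverse mapping only round-trips for Parquet datasets.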
