Skip to content

Commit a94a51e

Browse files
authored
Merge pull request #265 from SciCatProject/resilient-schemas
Use output DTO and ignore all unknown fields
2 parents e38edc6 + 852a0a5 commit a94a51e

File tree

11 files changed

+102
-284
lines changed

11 files changed

+102
-284
lines changed

src/scitacean/_base_model.py

-45
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from datetime import datetime
1111
from typing import (
1212
Any,
13-
ClassVar,
1413
TypeVar,
1514
overload,
1615
)
@@ -48,50 +47,6 @@ class DatasetType(str, Enum): # type: ignore[no-redef]
4847
class BaseModel(pydantic.BaseModel):
4948
"""Base class for Pydantic models for communication with SciCat."""
5049

51-
model_config = pydantic.ConfigDict(
52-
extra="forbid",
53-
)
54-
55-
_user_mask: ClassVar[tuple[str, ...]]
56-
_masked_fields: ClassVar[tuple[str, ...] | None] = None
57-
58-
# Some schemas contain fields that we don't want to use in Scitacean.
59-
# Normally, omitting them from the model would result in an error when
60-
# building a model from the JSON returned by SciCat.
61-
# The following subclass hook allows models to mark fields as masked.
62-
# Those will be silently dropped by __init__.
63-
# Note also the comment for _IGNORED_KWARGS below.
64-
def __init_subclass__(
65-
cls, /, masked: Iterable[str] | None = None, **kwargs: Any
66-
) -> None:
67-
super().__init_subclass__(**kwargs)
68-
cls._user_mask = tuple(masked) if masked is not None else ()
69-
70-
def __init__(self, **kwargs: Any) -> None:
71-
self._delete_ignored_args(kwargs)
72-
super().__init__(**kwargs)
73-
74-
def _delete_ignored_args(self, args: dict[str, Any]) -> None:
75-
if self._masked_fields is None:
76-
self._init_mask(self)
77-
for key in self._masked_fields: # type: ignore[union-attr]
78-
args.pop(key, None)
79-
80-
# Initializing the mask requires the field names which
81-
# are only available on instances.
82-
# So initialization needs to be deferred until the first instantiation of the model.
83-
# The mask is cached afterward.
84-
@classmethod
85-
def _init_mask(cls: type[ModelType], instance: ModelType) -> None:
86-
def get_name(name: str, field: Any) -> Any:
87-
return field.alias if field.alias is not None else name
88-
89-
field_names = {
90-
get_name(name, field) for name, field in instance.model_fields.items()
91-
}
92-
default_mask = tuple(key for key in _IGNORED_KWARGS if key not in field_names)
93-
cls._masked_fields = cls._user_mask + default_mask
94-
9550
@classmethod
9651
def user_model_type(cls) -> type[BaseUserModel] | None:
9752
"""Return the user model type for this model.

src/scitacean/_dataset_fields.py

+65-91
Large diffs are not rendered by default.

src/scitacean/model.py

+11-29
Original file line numberDiff line numberDiff line change
@@ -101,14 +101,12 @@
101101
from .thumbnail import Thumbnail
102102

103103

104-
# TODO remove extra masks after API v4
105-
class DownloadDataset(
106-
BaseModel, masked=("history", "proposalId", "sampleId", "instrumentId")
107-
):
104+
class DownloadDataset(BaseModel):
108105
contactEmail: str | None = None
109106
creationLocation: str | None = None
110107
creationTime: datetime | None = None
111108
inputDatasets: list[PID] | None = None
109+
investigator: str | None = None
112110
numberOfFilesArchived: NonNegativeInt | None = None
113111
owner: str | None = None
114112
ownerGroup: str | None = None
@@ -127,7 +125,7 @@ class DownloadDataset(
127125
description: str | None = None
128126
endTime: datetime | None = None
129127
instrumentGroup: str | None = None
130-
instrumentIds: list[str] | None = None
128+
instrumentId: str | None = None
131129
isPublished: bool | None = None
132130
jobLogData: str | None = None
133131
jobParameters: dict[str, Any] | None = None
@@ -141,9 +139,10 @@ class DownloadDataset(
141139
ownerEmail: str | None = None
142140
packedSize: NonNegativeInt | None = None
143141
pid: PID | None = None
144-
proposalIds: list[str] | None = None
142+
proposalId: str | None = None
145143
relationships: list[DownloadRelationship] | None = None
146-
sampleIds: list[str] | None = None
144+
runNumber: str | None = None
145+
sampleId: str | None = None
147146
sharedWith: list[str] | None = None
148147
size: NonNegativeInt | None = None
149148
sourceFolderHost: str | None = None
@@ -167,25 +166,6 @@ def _validate_emails(cls, value: Any) -> Any:
167166
def _validate_orcids(cls, value: Any) -> Any:
168167
return validate_orcids(value)
169168

170-
# TODO remove after API v4
171-
@pydantic.field_validator("sampleIds", mode="before")
172-
def _validate_sample_ids(cls, value: Any) -> Any:
173-
if value == [None]:
174-
return []
175-
return value
176-
177-
@pydantic.field_validator("proposalIds", mode="before")
178-
def _validate_proposal_ids(cls, value: Any) -> Any:
179-
if value == [None]:
180-
return []
181-
return value
182-
183-
@pydantic.field_validator("instrumentIds", mode="before")
184-
def _validate_instrument_ids(cls, value: Any) -> Any:
185-
if value == [None]:
186-
return []
187-
return value
188-
189169

190170
class UploadDerivedDataset(BaseModel):
191171
contactEmail: str
@@ -198,7 +178,6 @@ class UploadDerivedDataset(BaseModel):
198178
sourceFolder: RemotePath
199179
type: DatasetType
200180
usedSoftware: list[str]
201-
datasetName: str
202181
accessGroups: list[str] | None = None
203182
classification: str | None = None
204183
comment: str | None = None
@@ -211,12 +190,14 @@ class UploadDerivedDataset(BaseModel):
211190
keywords: list[str] | None = None
212191
license: str | None = None
213192
scientificMetadata: dict[str, Any] | None = None
193+
datasetName: str | None = None
214194
numberOfFiles: NonNegativeInt | None = None
215195
orcidOfOwner: str | None = None
216196
ownerEmail: str | None = None
217197
packedSize: NonNegativeInt | None = None
218198
proposalId: str | None = None
219199
relationships: list[UploadRelationship] | None = None
200+
runNumber: str | None = None
220201
sharedWith: list[str] | None = None
221202
size: NonNegativeInt | None = None
222203
sourceFolderHost: str | None = None
@@ -241,15 +222,14 @@ class UploadRawDataset(BaseModel):
241222
creationLocation: str
242223
creationTime: datetime
243224
inputDatasets: list[PID]
225+
investigator: str
244226
numberOfFilesArchived: NonNegativeInt
245227
owner: str
246228
ownerGroup: str
247229
principalInvestigator: str
248230
sourceFolder: RemotePath
249231
type: DatasetType
250232
usedSoftware: list[str]
251-
datasetName: str
252-
investigator: str | None = None
253233
accessGroups: list[str] | None = None
254234
classification: str | None = None
255235
comment: str | None = None
@@ -265,12 +245,14 @@ class UploadRawDataset(BaseModel):
265245
keywords: list[str] | None = None
266246
license: str | None = None
267247
scientificMetadata: dict[str, Any] | None = None
248+
datasetName: str | None = None
268249
numberOfFiles: NonNegativeInt | None = None
269250
orcidOfOwner: str | None = None
270251
ownerEmail: str | None = None
271252
packedSize: NonNegativeInt | None = None
272253
proposalId: str | None = None
273254
relationships: list[UploadRelationship] | None = None
255+
runNumber: str | None = None
274256
sampleId: str | None = None
275257
sharedWith: list[str] | None = None
276258
size: NonNegativeInt | None = None

src/scitacean/testing/backend/seed.py

+4
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@
142142
sourceFolder=RemotePath("/hex/raw1"),
143143
type=DatasetType.RAW,
144144
principalInvestigator="investigator 1",
145+
investigator="investigator 1",
145146
creationLocation="UU",
146147
proposalId="p0124",
147148
inputDatasets=[],
@@ -159,6 +160,7 @@
159160
sourceFolder=RemotePath("/hex/raw2"),
160161
type=DatasetType.RAW,
161162
principalInvestigator="investigator 2",
163+
investigator="investigator 2",
162164
creationLocation="UU",
163165
proposalId="p0124",
164166
inputDatasets=[],
@@ -176,6 +178,7 @@
176178
sourceFolder=RemotePath("/hex/raw3"),
177179
type=DatasetType.RAW,
178180
principalInvestigator="investigator 1",
181+
investigator="investigator 1",
179182
creationLocation="UU",
180183
proposalId="p0124",
181184
inputDatasets=[],
@@ -193,6 +196,7 @@
193196
sourceFolder=RemotePath("/hex/raw4"),
194197
type=DatasetType.RAW,
195198
principalInvestigator="investigator X",
199+
investigator="investigator X",
196200
creationLocation="UU",
197201
inputDatasets=[],
198202
usedSoftware=[],

tests/client/query_client_test.py

+4
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
from scitacean import PID, Client, RemotePath, model
99
from scitacean.testing.backend import seed
1010

11+
pytestmark = pytest.mark.skip(
12+
"Querying is currently broken because of a mismatch between DTOs and schemas."
13+
)
14+
1115

1216
@pytest.fixture
1317
def client(real_client: Client, require_scicat_backend: None) -> Client:

tests/dataset_fields_test.py

+2-62
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import dateutil.parser
1111
import pydantic
1212
import pytest
13-
from hypothesis import assume, given, settings
13+
from hypothesis import given, settings
1414
from hypothesis import strategies as st
1515

1616
from scitacean import PID, Dataset, DatasetType
@@ -354,6 +354,7 @@ def test_make_raw_model() -> None:
354354
owner="Ponder Stibbons;Mustrum Ridcully",
355355
ownerGroup="faculty",
356356
principalInvestigator="my principal investigator",
357+
investigator="my principal investigator",
357358
sourceFolder=RemotePath("/hex/source62"),
358359
type=DatasetType.RAW,
359360
scientificMetadata=None,
@@ -404,67 +405,6 @@ def test_make_derived_model() -> None:
404405
assert dset.make_upload_model() == expected
405406

406407

407-
@pytest.mark.parametrize(
408-
"field",
409-
(
410-
f
411-
for f in Dataset.fields(dataset_type="derived", read_only=False)
412-
if not f.used_by_raw and f.name not in _UNGENERATABLE_FIELDS
413-
),
414-
ids=lambda f: f.name,
415-
)
416-
@given(st.data())
417-
@settings(max_examples=10)
418-
def test_make_raw_model_raises_if_derived_field_set(
419-
field: Dataset.Field, data: st.DataObject
420-
) -> None:
421-
dset = Dataset(
422-
type="raw",
423-
contact_email="[email protected]",
424-
creation_time="2142-04-02T16:44:56",
425-
owner="Mustrum Ridcully",
426-
owner_group="faculty",
427-
principal_investigator="[email protected]",
428-
source_folder=RemotePath("/hex/source62"),
429-
)
430-
val = data.draw(st.from_type(field.type))
431-
assume(val is not None)
432-
with pytest.raises(pydantic.ValidationError):
433-
dset.make_upload_model()
434-
435-
436-
@pytest.mark.parametrize(
437-
"field",
438-
(
439-
f
440-
for f in Dataset.fields(dataset_type="raw", read_only=False)
441-
if not f.used_by_derived and f.name not in _UNGENERATABLE_FIELDS
442-
),
443-
ids=lambda f: f.name,
444-
)
445-
@given(st.data())
446-
@settings(max_examples=10)
447-
def test_make_derived_model_raises_if_raw_field_set(
448-
field: Dataset.Field, data: st.DataObject
449-
) -> None:
450-
dset = Dataset(
451-
type="derived",
452-
contact_email="[email protected]",
453-
creation_time="2142-04-02T16:44:56",
454-
owner="Ponder Stibbons",
455-
owner_group="faculty",
456-
investigator="[email protected]",
457-
source_folder=RemotePath("/hex/source62"),
458-
input_datasets=[PID(pid="623-122")],
459-
used_software=["scitacean", "magick"],
460-
)
461-
val = data.draw(st.from_type(field.type))
462-
assume(val is not None)
463-
setattr(dset, field.name, val)
464-
with pytest.raises(pydantic.ValidationError):
465-
dset.make_upload_model()
466-
467-
468408
@pytest.mark.parametrize("field", ["contact_email", "owner_email"])
469409
def test_email_validation(field: Dataset.Field) -> None:
470410
dset = Dataset(

tests/dataset_test.py

+6-9
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def raw_download_model() -> model.DownloadDataset:
4343
description="Some shady data",
4444
endTime=parse_datetime("1995-08-03T00:00:00Z"),
4545
instrumentGroup="professors",
46-
instrumentIds=["0000-aa"],
46+
instrumentId="0000-aa",
4747
isPublished=True,
4848
jobLogData=None,
4949
jobParameters=None,
@@ -55,8 +55,8 @@ def raw_download_model() -> model.DownloadDataset:
5555
ownerEmail="[email protected]",
5656
packedSize=0,
5757
pid=PID.parse("123.cc/948.f7.2a"),
58-
proposalIds=["33.dc"],
59-
sampleIds=["bac.a4"],
58+
proposalId="33.dc",
59+
sampleId="bac.a4",
6060
sharedWith=["librarian"],
6161
size=400,
6262
sourceFolderHost="ftp://uu.am/data",
@@ -111,7 +111,7 @@ def derived_download_model() -> model.DownloadDataset:
111111
description="Dubiously analyzed data",
112112
endTime=None,
113113
instrumentGroup="professors",
114-
instrumentIds=None,
114+
instrumentId=None,
115115
isPublished=True,
116116
jobLogData="process interrupted",
117117
jobParameters={"nodes": 4},
@@ -123,8 +123,8 @@ def derived_download_model() -> model.DownloadDataset:
123123
ownerEmail="[email protected]",
124124
packedSize=0,
125125
pid=PID.parse("123.cc/948.f7.2a"),
126-
proposalIds=None,
127-
sampleIds=None,
126+
proposalId=None,
127+
sampleId=None,
128128
sharedWith=["librarian"],
129129
size=400,
130130
sourceFolderHost="ftp://uu.am/data",
@@ -365,11 +365,8 @@ def test_dataset_models_roundtrip(initial: Dataset) -> None:
365365
# TODO remove in API v4
366366
rebuilt.investigator = initial.investigator
367367
rebuilt.proposal_id = initial.proposal_id
368-
initial._proposal_ids = rebuilt.proposal_ids # type: ignore[assignment]
369368
rebuilt.sample_id = initial.sample_id
370-
initial._sample_ids = rebuilt.sample_ids # type: ignore[assignment]
371369
rebuilt.instrument_id = initial.instrument_id
372-
initial._instrument_ids = rebuilt.instrument_ids # type: ignore[assignment]
373370

374371
assert initial == rebuilt
375372

tests/model_test.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ def test_raw_dataset_default_values(
182182
owner=scicat_access.user.username,
183183
ownerGroup=scicat_access.user.group,
184184
principalInvestigator="[email protected]",
185+
investigator="[email protected]",
185186
sourceFolder=RemotePath("/source/folder"),
186187
type=DatasetType.RAW,
187188
usedSoftware=["software1"],
@@ -207,15 +208,15 @@ def test_raw_dataset_default_values(
207208
assert finalized.createdAt # some non-empty str
208209
assert finalized.createdBy # some non-empty str
209210
assert finalized.classification # some non-empty str
210-
assert finalized.instrumentIds == []
211+
assert finalized.instrumentId is None
211212
assert finalized.isPublished is False
212213
assert finalized.keywords == []
213214
assert finalized.numberOfFiles == 0
214215
assert finalized.numberOfFilesArchived == 0
215216
assert finalized.packedSize == 0
216217
assert finalized.pid # some non-empty str
217-
assert finalized.proposalIds == []
218-
assert finalized.sampleIds == []
218+
assert finalized.proposalId is None
219+
assert finalized.sampleId is None
219220
assert finalized.scientificMetadata == {}
220221
assert finalized.sharedWith == []
221222
assert finalized.size == 0
@@ -271,7 +272,7 @@ def test_fields_override_masks() -> None:
271272
assert not hasattr(mod, "_id")
272273

273274

274-
def test_fields_override_masks_att() -> None:
275+
def test_fields_override_masks_attachment() -> None:
275276
# 'id' is masked but the model has a field 'id' without an alias
276277
mod = DownloadAttachment( # type: ignore[call-arg]
277278
_id="abc",

0 commit comments

Comments
 (0)