Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 101 additions & 10 deletions relecov_tools/build_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,11 @@ def _load_laboratory_addresses(self):
Returns two dictionaries with key in the three special fields:
- dropdowns[field] ........ list ‘<name> [<city>] [<ccn>]’
- uniques[field] .......... unique names for schema enum

NOTE:
For RELECOV, laboratory_address.json stores institution names under
`collecting_institution`. We intentionally reuse that same source for
collecting/submitting/sequencing to keep the three schema enums aligned.
"""
json_path = os.path.join(
os.path.dirname(__file__),
Expand All @@ -241,12 +246,13 @@ def _load_laboratory_addresses(self):

for ccn, info in lab_data.items():
city = info.get("geo_loc_city", "").strip()

name = info.get("collecting_institution", "").strip()
if not name:
continue
dropdown_entry = f"{name} [{city}] [{ccn}]"
for f in fields:
name = info.get(f, "").strip()
if name:
dropdowns[f].append(f"{name} [{city}] [{ccn}]")
uniques[f].add(name)
dropdowns[f].append(dropdown_entry)
uniques[f].add(name)

dropdowns = {k: sorted(v) for k, v in dropdowns.items()}
uniques = {k: sorted(v) for k, v in uniques.items()}
Expand Down Expand Up @@ -455,8 +461,74 @@ def create_schema_draft_template(self):
)
return draft_template

def _cast_example_to_type(
self, property_id: str, expected_type: str | None, value: any
) -> any:
"""Cast a single example value to the declared JSON-schema type when possible."""
if not isinstance(expected_type, str):
return value
expected = expected_type.strip().lower()
if expected == "string":
return str(value)
if expected == "integer":
try:
parsed_number = float(value)
except (TypeError, ValueError):
self.log.warning(
"Example value %r for property '%s' does not match expected type 'integer'. Keeping original value.",
value,
property_id,
)
return value
if not parsed_number.is_integer():
self.log.warning(
"Example value %r for property '%s' does not match expected type 'integer'. Keeping original value.",
value,
property_id,
)
return value
return int(parsed_number)
if expected == "number":
try:
return float(value)
except (TypeError, ValueError):
self.log.warning(
"Example value %r for property '%s' does not match expected type 'number'. Keeping original value.",
value,
property_id,
)
return value
if expected == "boolean":
if isinstance(value, bool):
return value
if isinstance(value, str):
normalized = value.strip().lower()
if normalized in ("true", "1", "yes", "y"):
return True
if normalized in ("false", "0", "no", "n"):
return False
self.log.warning(
"Example value %r for property '%s' does not match expected type 'boolean'. Keeping original value.",
value,
property_id,
)
return value
return value

def _cast_examples_to_declared_type(
self, property_id: str, expected_type: str | None, values: list[any]
) -> list[any]:
return [
self._cast_example_to_type(property_id, expected_type, item)
for item in values
]

def jsonschema_object(
self, property_id: str, property_feature_key: str, value: any
self,
property_id: str,
property_feature_key: str,
value: any,
expected_type: str | None = None,
) -> dict[str, any]:
"""
Process a property keyword with their value and return a dictionary with fields for a property.
Expand Down Expand Up @@ -487,15 +559,25 @@ def jsonschema_object(
jsonschema_value[key] = value
# FIXME multiple examples will always be loaded as str, regardless of actual type
case "examples", str(value):
jsonschema_value = {property_feature_key: value.split("; ")}
parsed_examples = value.split("; ")
parsed_examples = self._cast_examples_to_declared_type(
property_id, expected_type, parsed_examples
)
jsonschema_value = {property_feature_key: parsed_examples}
case "examples", datetime():
value = value.strftime("%Y-%m-%dT%H:%M:%S")
value = value.replace("T00:00:00", "")
jsonschema_value = {property_feature_key: [value]}
parsed_examples = self._cast_examples_to_declared_type(
property_id, expected_type, [value]
)
jsonschema_value = {property_feature_key: parsed_examples}
case "examples", int(value) | float(value):
value = float(value)
value = [int(value) if value.is_integer() else value]
jsonschema_value = {property_feature_key: value}
parsed_examples = [int(value) if value.is_integer() else value]
parsed_examples = self._cast_examples_to_declared_type(
property_id, expected_type, parsed_examples
)
jsonschema_value = {property_feature_key: parsed_examples}
case "enum", str():
jsonschema_value = {"$ref": f"#/$defs/enums/{property_id}"}
case _, value if not pd.isna(value):
Expand Down Expand Up @@ -532,6 +614,14 @@ def handle_properties(self, json_data: dict[str, dict]) -> tuple[dict, dict, dic
for property_id, db_features_dic in json_data.items():
is_required = db_features_dic.get("required (Y/N)", "") == "Y"
has_enum = db_features_dic.get("enum", False)
if property_id in [
"collecting_institution",
"submitting_institution",
"sequencing_institution",
]:
lab_values = self._lab_uniques.get(property_id, [])
if lab_values:
has_enum = "; ".join(lab_values)

# Create empty placeholder
schema_property[property_id] = {}
Expand Down Expand Up @@ -569,6 +659,7 @@ def handle_properties(self, json_data: dict[str, dict]) -> tuple[dict, dict, dic
property_id,
mapping_features[db_feature_key],
db_feature_value,
expected_type=db_features_dic.get("type"),
)
if std_json_feature:
schema_property[property_id].update(std_json_feature)
Expand Down
3 changes: 2 additions & 1 deletion relecov_tools/conf/configuration.json
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
"required_copy_from_other_field"
],
"schema_file": "relecov_schema.json",
"cast_values_from_schema": false,
"unique_sample_id": "sequencing_sample_id",
"fixed_fields": {
"study_type": "Whole Genome Sequencing",
Expand Down Expand Up @@ -180,7 +181,7 @@
},
"upload_to_ena": {
"ENA_configuration": {},
"checklist": "",
"checklist": "default_checklist",
"templates_path": "",
"tool": {
"tool_name": "ena-upload-cli",
Expand Down
132 changes: 0 additions & 132 deletions relecov_tools/conf/initial_config.yaml

This file was deleted.

57 changes: 57 additions & 0 deletions relecov_tools/conf/read_lab_metadata_heading_default.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
[
"Organism",
"Public Health sample id (SIVIRA)",
"Sample ID given by originating laboratory",
"Sample ID given by the submitting laboratory",
"Sample ID given in the microbiology lab",
"Sample ID given if multiple rna-extraction or passages",
"Sample ID given for sequencing",
"ENA Sample ID",
"GISAID Virus Name",
"GISAID id",
"Originating Laboratory",
"Submitting Institution",
"Sequencing Institution",
"Sample Collection Date",
"Sample Received Date",
"Purpose of sampling",
"Biological Sample Storage Condition",
"Specimen source",
"Environmental Material",
"Environmental System",
"Collection Device",
"Host",
"Host Age Years",
"Host Age Months",
"Host Gender",
"Vaccinated",
"Specific medication for treatment or prophylaxis",
"Hospitalization",
"Admission to intensive care unit",
"Death",
"Immunosuppression",
"Sequencing Date",
"Nucleic acid extraction protocol",
"Commercial All-in-one library kit",
"Library Preparation Kit",
"Enrichment Protocol",
"If Enrichment Protocol Is Other, Specify",
"Enrichment panel/assay",
"If Enrichment panel/assay Is Other, Specify",
"Enrichment panel/assay version",
"Number Of Samples In Run",
"Runid",
"Sequencing Instrument Model",
"Flowcell Kit",
"Source material",
"Capture method",
"Sequencing technique",
"Library Layout",
"Gene Name 1",
"Diagnostic Pcr Ct Value 1",
"Gene Name 2",
"Diagnostic Pcr Ct Value-2",
"Authors",
"Sequence file R1",
"Sequence file R2"
]
Loading
Loading