Adjust TODOs and apply Ruff formatting

VarenyaJ · VarenyaJ · commit 86761a1ef425 · 2025-08-07T14:21:25.000+02:00
diff --git a/src/P6/__main__.py b/src/P6/__main__.py
@@ -200,15 +200,21 @@ def parse_excel(
         phenotype_records,
         disease_records,
         measurement_records,
-        biosample_records
+        biosample_records,
     ) = mapper.apply_mapping(tables, notepad)
     # TODO: Come back and add more top-level fields
 
     # 5) Report any errors or warnings
     _report_issues(notepad)
 
     # 6) Group results by patient
-    records_by_patient = _group_records_by_patient(genotype_records, phenotype_records, disease_records, measurement_records, biosample_records)
+    records_by_patient = _group_records_by_patient(
+        genotype_records,
+        phenotype_records,
+        disease_records,
+        measurement_records,
+        biosample_records,
+    )
 
     # 7) Prepare output directory with timestamp
     # Will contain genotype and phenotype records as JSON
@@ -262,10 +268,22 @@ def _report_issues(notepad):
 
 
 def _group_records_by_patient(
-    genotype_records: list, phenotype_records: list, disease_records: list, measurement_records: list, biosample_records: list
+    genotype_records: list,
+    phenotype_records: list,
+    disease_records: list,
+    measurement_records: list,
+    biosample_records: list,
 ) -> dict[str, dict[str, list]]:
     # Group genotype & phenotype records by patient ID
-    records = defaultdict(lambda: {"genotype_records": [], "phenotype_records": [], "disease_records": [], "measurement_records": [], "biosample_records": []})
+    records = defaultdict(
+        lambda: {
+            "genotype_records": [],
+            "phenotype_records": [],
+            "disease_records": [],
+            "measurement_records": [],
+            "biosample_records": [],
+        }
+    )
     for genotype in genotype_records:
         records[genotype.genotype_patient_ID]["genotype_records"].append(genotype)
     for phenotype in phenotype_records:
diff --git a/src/P6/biosample.py b/src/P6/biosample.py
@@ -6,6 +6,7 @@
 
 from dataclasses import dataclass
 
+
 @dataclass
 class BiosampleRecord:
     """
@@ -17,6 +18,7 @@ class BiosampleRecord:
         biosample_type: CURIE of the tissue or sample type (e.g. 'UBERON:0002107').
         collection_date: Date string in 'YYYY-MM-DD' format.
     """
+
     patient_ID: str
     biosample_id: str
     biosample_type: str
diff --git a/src/P6/disease.py b/src/P6/disease.py
@@ -6,6 +6,7 @@
 
 from dataclasses import dataclass
 
+
 @dataclass
 class DiseaseRecord:
     """
@@ -18,6 +19,7 @@ class DiseaseRecord:
         disease_onset: Date string in 'YYYY-MM-DD' format.
         disease_status: True if the disease is present, False if excluded.
     """
+
     patient_ID: str
     disease_term: str
     disease_label: str
diff --git a/src/P6/mapper.py b/src/P6/mapper.py
@@ -51,9 +51,9 @@
 PHENOTYPE_KEY_COLUMNS = {"hpo_id", "date_of_observation", "status"}
 
 # Key columns to identify additional sheets
-DISEASE_KEY_COLUMNS     = {"disease_term", "disease_onset"}
+DISEASE_KEY_COLUMNS = {"disease_term", "disease_onset"}
 MEASUREMENT_KEY_COLUMNS = {"measurement_type", "measurement_value", "measurement_unit"}
-BIOSAMPLE_KEY_COLUMNS   = {"biosample_id", "biosample_type", "collection_date"}
+BIOSAMPLE_KEY_COLUMNS = {"biosample_id", "biosample_type", "collection_date"}
 
 
 # Map raw zygosity abbreviations to allowed dataclass zygosity values
@@ -110,34 +110,37 @@ def apply_mapping(
         list[DiseaseRecord],
         list[MeasurementRecord],
         list[BiosampleRecord],
-        ]:
+    ]:
         """
         1) classify each sheet as genotype / phenotype / disease / measurement / biosample
         2) call the matching mapper
         """
         # initialize the lists to return
-        genotype_records: list[Genotype]                = []
-        phenotype_records: list[Phenotype]              = []
-        disease_records: list[DiseaseRecord]            = []
-        measurement_records: list[MeasurementRecord]    = []
-        biosample_records: list[BiosampleRecord]        = []
-
+        genotype_records: list[Genotype] = []
+        phenotype_records: list[Phenotype] = []
+        disease_records: list[DiseaseRecord] = []
+        measurement_records: list[MeasurementRecord] = []
+        biosample_records: list[BiosampleRecord] = []
 
         for sheet_name, df in tables.items():
             columns = set(df.columns)
             """ 1) classify: does this look like genotype, phenotype, or something to skip? """
             has_raw = RAW_VARIANT_COLUMNS.issubset(columns)
             has_hgvs = bool(HGVS_VARIANT_COLUMNS & columns)
             """Send each sheet to the right extractor and collect all records."""
-            is_genotype_sheet = GENOTYPE_BASE_COLUMNS.issubset(columns) and (has_raw or has_hgvs)
+            is_genotype_sheet = GENOTYPE_BASE_COLUMNS.issubset(columns) and (
+                has_raw or has_hgvs
+            )
             is_phenotype_sheet = PHENOTYPE_KEY_COLUMNS.issubset(columns)
 
             if is_genotype_sheet == is_phenotype_sheet:
                 # if we have both raw & HGVS notations, we need to validate that they match
                 if has_raw and has_hgvs:
                     self._check_hgvs_consistency(sheet_name, df, notepad)
                 # ambiguous sheet should give a warning instead of an error
-                notepad.add_warning(f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype")
+                notepad.add_warning(
+                    f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype"
+                )
                 continue
 
             # rename the former-index column
@@ -157,10 +160,14 @@ def apply_mapping(
                 disease_records.extend(self._map_disease(sheet_name, working, notepad))
                 continue
             if MEASUREMENT_KEY_COLUMNS.issubset(columns):
-                measurement_records.extend(self._map_measurement(sheet_name, working, notepad))
+                measurement_records.extend(
+                    self._map_measurement(sheet_name, working, notepad)
+                )
                 continue
             if BIOSAMPLE_KEY_COLUMNS.issubset(columns):
-                biosample_records.extend(self._map_biosample(sheet_name, working, notepad))
+                biosample_records.extend(
+                    self._map_biosample(sheet_name, working, notepad)
+                )
                 continue
 
         return (
@@ -376,26 +383,30 @@ def _map_phenotype(
 
         return records
 
-    def _map_disease(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[DiseaseRecord]:
+    def _map_disease(
+        self, sheet_name: str, df: pd.DataFrame, notepad: Notepad
+    ) -> list[DiseaseRecord]:
+        # TODO: implement the row→DiseaseRecord, row→MeasurementRecord, and row→BiosampleRecord conversions
         """
         Map each row in a disease sheet to a DiseaseRecord.
         """
-        records: list[DiseaseRecord] = []
-        # TODO: implement row→DiseaseRecord conversion
+        # TODO: reinstate `records: list[DiseaseRecord] = []` once this is implemented
         raise NotImplementedError
 
-    def _map_measurement(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[MeasurementRecord]:
+    def _map_measurement(
+        self, sheet_name: str, df: pd.DataFrame, notepad: Notepad
+    ) -> list[MeasurementRecord]:
         """
         Map each row in a measurement sheet to a MeasurementRecord.
         """
-        records: list[MeasurementRecord] = []
-        # TODO: implement row→MeasurementRecord conversion
+        # TODO: reinstate `records: list[MeasurementRecord] = []` once this is implemented
         raise NotImplementedError
 
-    def _map_biosample(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[BiosampleRecord]:
+    def _map_biosample(
+        self, sheet_name: str, df: pd.DataFrame, notepad: Notepad
+    ) -> list[BiosampleRecord]:
         """
         Map each row in a biosample sheet to a BiosampleRecord.
         """
-        records: list[BiosampleRecord] = []
-        # TODO: implement row→BiosampleRecord conversion
+        # TODO: reinstate `records: list[BiosampleRecord] = []` once this is implemented
         raise NotImplementedError
diff --git a/src/P6/measurement.py b/src/P6/measurement.py
@@ -6,6 +6,7 @@
 
 from dataclasses import dataclass
 
+
 @dataclass
 class MeasurementRecord:
     """
@@ -18,6 +19,7 @@ class MeasurementRecord:
         measurement_unit: Unit CURIE or string (e.g. 'mmol/L').
         measurement_timestamp: ISO timestamp string (e.g. '2025-07-15T14:23:00').
     """
+
     patient_ID: str
     measurement_type: str
     measurement_value: float
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -24,6 +24,7 @@ def fpath_hpo(fpath_test_dir: str) -> str:
 def hpo(fpath_hpo: str) -> hpotk.MinimalOntology:
     return hpotk.load_minimal_ontology(fpath_hpo)
 
+
 # use the `hpo` already defined above in this file
 @pytest.fixture
 def decompressed_hpo(fpath_hpo: str, tmp_path: Path) -> str:
@@ -33,4 +34,4 @@ def decompressed_hpo(fpath_hpo: str, tmp_path: Path) -> str:
     out = tmp_path / "hp.json"
     with gzip.open(fpath_hpo, "rb") as f_in, open(out, "wb") as f_out:
         shutil.copyfileobj(f_in, f_out)
-    return str(out)
+    return str(out)
diff --git a/tests/test_full_features.py b/tests/test_full_features.py
@@ -2,22 +2,19 @@
 import re
 import glob
 import json
-import pytest
 from click.testing import CliRunner
 from P6.__main__ import main
 
 # Path to the “full” example workbook (e.g. created under tests/data/full_example.xlsx)
-FULL_XLSX = os.path.join(os.path.dirname(__file__), "data", "Sydney_Python_transformation.xlsx")
+FULL_XLSX = os.path.join(
+    os.path.dirname(__file__), "data", "Sydney_Python_transformation.xlsx"
+)
+
 
 def test_full_features_parse_creates_all_blocks(decompressed_hpo):
     runner = CliRunner()
     result = runner.invoke(
-        main,
-        [
-            "parse-excel",
-            "-e", FULL_XLSX,
-            "--custom-hpo", decompressed_hpo
-        ],
+        main, ["parse-excel", "-e", FULL_XLSX, "--custom-hpo", decompressed_hpo]
     )
     assert result.exit_code == 0, result.output
 
@@ -32,6 +29,6 @@ def test_full_features_parse_creates_all_blocks(decompressed_hpo):
 
     # Load the first phenopacket and check new keys
     pkt = json.load(open(json_files[0], encoding="utf-8"))
-    assert isinstance(pkt.get("diseases", []), list),      "Missing 'diseases' block"
+    assert isinstance(pkt.get("diseases", []), list), "Missing 'diseases' block"
     assert isinstance(pkt.get("measurements", []), list), "Missing 'measurements' block"
-    assert isinstance(pkt.get("biosamples", []), list),   "Missing 'biosamples' block"
+    assert isinstance(pkt.get("biosamples", []), list), "Missing 'biosamples' block"