Skip to content

Commit 86761a1

Browse files
committed
Adjust TODOs and initiate Ruff formatting;
1 parent 86028e0 commit 86761a1

7 files changed

Lines changed: 70 additions & 37 deletions

File tree

src/P6/__main__.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,15 +200,21 @@ def parse_excel(
200200
phenotype_records,
201201
disease_records,
202202
measurement_records,
203-
biosample_records
203+
biosample_records,
204204
) = mapper.apply_mapping(tables, notepad)
205205
# TODO: Come back and add more top-level fields
206206

207207
# 5) Report any errors or warnings
208208
_report_issues(notepad)
209209

210210
# 6) Group results by patient
211-
records_by_patient = _group_records_by_patient(genotype_records, phenotype_records, disease_records, measurement_records, biosample_records)
211+
records_by_patient = _group_records_by_patient(
212+
genotype_records,
213+
phenotype_records,
214+
disease_records,
215+
measurement_records,
216+
biosample_records,
217+
)
212218

213219
# 7) Prepare output directory with timestamp
214220
# Will contain genotype and phenotype records as JSON
@@ -262,10 +268,22 @@ def _report_issues(notepad):
262268

263269

264270
def _group_records_by_patient(
265-
genotype_records: list, phenotype_records: list, disease_records: list, measurement_records: list, biosample_records: list
271+
genotype_records: list,
272+
phenotype_records: list,
273+
disease_records: list,
274+
measurement_records: list,
275+
biosample_records: list,
266276
) -> dict[str, dict[str, list]]:
267277
# Group genotype, phenotype, disease, measurement & biosample records by patient ID
268-
records = defaultdict(lambda: {"genotype_records": [], "phenotype_records": [], "disease_records": [], "measurement_records": [], "biosample_records": []})
278+
records = defaultdict(
279+
lambda: {
280+
"genotype_records": [],
281+
"phenotype_records": [],
282+
"disease_records": [],
283+
"measurement_records": [],
284+
"biosample_records": [],
285+
}
286+
)
269287
for genotype in genotype_records:
270288
records[genotype.genotype_patient_ID]["genotype_records"].append(genotype)
271289
for phenotype in phenotype_records:

src/P6/biosample.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from dataclasses import dataclass
88

9+
910
@dataclass
1011
class BiosampleRecord:
1112
"""
@@ -17,6 +18,7 @@ class BiosampleRecord:
1718
biosample_type: CURIE of the tissue or sample type (e.g. 'UBERON:0002107').
1819
collection_date: Date string in 'YYYY-MM-DD' format.
1920
"""
21+
2022
patient_ID: str
2123
biosample_id: str
2224
biosample_type: str

src/P6/disease.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from dataclasses import dataclass
88

9+
910
@dataclass
1011
class DiseaseRecord:
1112
"""
@@ -18,6 +19,7 @@ class DiseaseRecord:
1819
disease_onset: Date string in 'YYYY-MM-DD' format.
1920
disease_status: True if the disease is present, False if excluded.
2021
"""
22+
2123
patient_ID: str
2224
disease_term: str
2325
disease_label: str

src/P6/mapper.py

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,9 @@
5151
PHENOTYPE_KEY_COLUMNS = {"hpo_id", "date_of_observation", "status"}
5252

5353
# Key columns to identify additional sheets
54-
DISEASE_KEY_COLUMNS = {"disease_term", "disease_onset"}
54+
DISEASE_KEY_COLUMNS = {"disease_term", "disease_onset"}
5555
MEASUREMENT_KEY_COLUMNS = {"measurement_type", "measurement_value", "measurement_unit"}
56-
BIOSAMPLE_KEY_COLUMNS = {"biosample_id", "biosample_type", "collection_date"}
56+
BIOSAMPLE_KEY_COLUMNS = {"biosample_id", "biosample_type", "collection_date"}
5757

5858

5959
# Map raw zygosity abbreviations to allowed dataclass zygosity values
@@ -110,34 +110,37 @@ def apply_mapping(
110110
list[DiseaseRecord],
111111
list[MeasurementRecord],
112112
list[BiosampleRecord],
113-
]:
113+
]:
114114
"""
115115
1) classify each sheet as genotype / phenotype / disease / measurement / biosample
116116
2) call the matching mapper
117117
"""
118118
# initialize the lists to return
119-
genotype_records: list[Genotype] = []
120-
phenotype_records: list[Phenotype] = []
121-
disease_records: list[DiseaseRecord] = []
122-
measurement_records: list[MeasurementRecord] = []
123-
biosample_records: list[BiosampleRecord] = []
124-
119+
genotype_records: list[Genotype] = []
120+
phenotype_records: list[Phenotype] = []
121+
disease_records: list[DiseaseRecord] = []
122+
measurement_records: list[MeasurementRecord] = []
123+
biosample_records: list[BiosampleRecord] = []
125124

126125
for sheet_name, df in tables.items():
127126
columns = set(df.columns)
128127
""" 1) classify: does this look like genotype, phenotype, or something to skip? """
129128
has_raw = RAW_VARIANT_COLUMNS.issubset(columns)
130129
has_hgvs = bool(HGVS_VARIANT_COLUMNS & columns)
131130
"""Send each sheet to the right extractor and collect all records."""
132-
is_genotype_sheet = GENOTYPE_BASE_COLUMNS.issubset(columns) and (has_raw or has_hgvs)
131+
is_genotype_sheet = GENOTYPE_BASE_COLUMNS.issubset(columns) and (
132+
has_raw or has_hgvs
133+
)
133134
is_phenotype_sheet = PHENOTYPE_KEY_COLUMNS.issubset(columns)
134135

135136
if is_genotype_sheet == is_phenotype_sheet:
136137
# if we have both raw & HGVS notations, we need to validate that they match
137138
if has_raw and has_hgvs:
138139
self._check_hgvs_consistency(sheet_name, df, notepad)
139140
# ambiguous sheet should give a warning instead of an error
140-
notepad.add_warning(f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype")
141+
notepad.add_warning(
142+
f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype"
143+
)
141144
continue
142145

143146
# rename the former-index column
@@ -157,10 +160,14 @@ def apply_mapping(
157160
disease_records.extend(self._map_disease(sheet_name, working, notepad))
158161
continue
159162
if MEASUREMENT_KEY_COLUMNS.issubset(columns):
160-
measurement_records.extend(self._map_measurement(sheet_name, working, notepad))
163+
measurement_records.extend(
164+
self._map_measurement(sheet_name, working, notepad)
165+
)
161166
continue
162167
if BIOSAMPLE_KEY_COLUMNS.issubset(columns):
163-
biosample_records.extend(self._map_biosample(sheet_name, working, notepad))
168+
biosample_records.extend(
169+
self._map_biosample(sheet_name, working, notepad)
170+
)
164171
continue
165172

166173
return (
@@ -376,26 +383,30 @@ def _map_phenotype(
376383

377384
return records
378385

379-
def _map_disease(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[DiseaseRecord]:
386+
def _map_disease(
387+
self, sheet_name: str, df: pd.DataFrame, notepad: Notepad
388+
) -> list[DiseaseRecord]:
389+
# TODO: implement the row→DiseaseRecord, row→MeasurementRecord, and row→BiosampleRecord conversions
380390
"""
381391
Map each row in a disease sheet to a DiseaseRecord.
382392
"""
383-
records: list[DiseaseRecord] = []
384-
# TODO: implement row→DiseaseRecord conversion
393+
# TODO: `records: list[DiseaseRecord] = []` is unused until the row→DiseaseRecord conversion is implemented; reinstate it then
385394
raise NotImplementedError
386395

387-
def _map_measurement(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[MeasurementRecord]:
396+
def _map_measurement(
397+
self, sheet_name: str, df: pd.DataFrame, notepad: Notepad
398+
) -> list[MeasurementRecord]:
388399
"""
389400
Map each row in a measurement sheet to a MeasurementRecord.
390401
"""
391-
records: list[MeasurementRecord] = []
392-
# TODO: implement row→MeasurementRecord conversion
402+
# TODO: `records: list[MeasurementRecord] = []` is unused until the row→MeasurementRecord conversion is implemented; reinstate it then
393403
raise NotImplementedError
394404

395-
def _map_biosample(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[BiosampleRecord]:
405+
def _map_biosample(
406+
self, sheet_name: str, df: pd.DataFrame, notepad: Notepad
407+
) -> list[BiosampleRecord]:
396408
"""
397409
Map each row in a biosample sheet to a BiosampleRecord.
398410
"""
399-
records: list[BiosampleRecord] = []
400-
# TODO: implement row→BiosampleRecord conversion
411+
# TODO: `records: list[BiosampleRecord] = []` is unused until the row→BiosampleRecord conversion is implemented; reinstate it then
401412
raise NotImplementedError

src/P6/measurement.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from dataclasses import dataclass
88

9+
910
@dataclass
1011
class MeasurementRecord:
1112
"""
@@ -18,6 +19,7 @@ class MeasurementRecord:
1819
measurement_unit: Unit CURIE or string (e.g. 'mmol/L').
1920
measurement_timestamp: ISO timestamp string (e.g. '2025-07-15T14:23:00').
2021
"""
22+
2123
patient_ID: str
2224
measurement_type: str
2325
measurement_value: float

tests/conftest.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def fpath_hpo(fpath_test_dir: str) -> str:
2424
def hpo(fpath_hpo: str) -> hpotk.MinimalOntology:
2525
return hpotk.load_minimal_ontology(fpath_hpo)
2626

27+
2728
# use the `hpo` already defined above in this file
2829
@pytest.fixture
2930
def decompressed_hpo(fpath_hpo: str, tmp_path: Path) -> str:
@@ -33,4 +34,4 @@ def decompressed_hpo(fpath_hpo: str, tmp_path: Path) -> str:
3334
out = tmp_path / "hp.json"
3435
with gzip.open(fpath_hpo, "rb") as f_in, open(out, "wb") as f_out:
3536
shutil.copyfileobj(f_in, f_out)
36-
return str(out)
37+
return str(out)

tests/test_full_features.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,19 @@
22
import re
33
import glob
44
import json
5-
import pytest
65
from click.testing import CliRunner
76
from P6.__main__ import main
87

98
# Path to the “full” example workbook (tests/data/Sydney_Python_transformation.xlsx)
10-
FULL_XLSX = os.path.join(os.path.dirname(__file__), "data", "Sydney_Python_transformation.xlsx")
9+
FULL_XLSX = os.path.join(
10+
os.path.dirname(__file__), "data", "Sydney_Python_transformation.xlsx"
11+
)
12+
1113

1214
def test_full_features_parse_creates_all_blocks(decompressed_hpo):
1315
runner = CliRunner()
1416
result = runner.invoke(
15-
main,
16-
[
17-
"parse-excel",
18-
"-e", FULL_XLSX,
19-
"--custom-hpo", decompressed_hpo
20-
],
17+
main, ["parse-excel", "-e", FULL_XLSX, "--custom-hpo", decompressed_hpo]
2118
)
2219
assert result.exit_code == 0, result.output
2320

@@ -32,6 +29,6 @@ def test_full_features_parse_creates_all_blocks(decompressed_hpo):
3229

3330
# Load the first phenopacket and check new keys
3431
pkt = json.load(open(json_files[0], encoding="utf-8"))
35-
assert isinstance(pkt.get("diseases", []), list), "Missing 'diseases' block"
32+
assert isinstance(pkt.get("diseases", []), list), "Missing 'diseases' block"
3633
assert isinstance(pkt.get("measurements", []), list), "Missing 'measurements' block"
37-
assert isinstance(pkt.get("biosamples", []), list), "Missing 'biosamples' block"
34+
assert isinstance(pkt.get("biosamples", []), list), "Missing 'biosamples' block"

0 commit comments

Comments
 (0)