From cae7f1ec884ce02377954712e3773b2e34d4f625 Mon Sep 17 00:00:00 2001 From: Varenya Jain <78588847+VarenyaJ@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:21:26 +0200 Subject: [PATCH 01/36] Clean up TODO asking to write phenopackets to a folder --- src/P6/__main__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 8ac8f4b..57d533a 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -121,11 +121,6 @@ def parse_excel(excel_file: str, hpo_path: typing.Optional[str] = None): # 5) Report any errors or warnings _report_issues(notepad) - # pps = mapper.apply_mapping(all_sheets, notepad) - # assert not notepad.has_errors_or_warnings(include_subsections=True) - # TODO: write phenopackets to a folder - # click.echo(f"Created {len(pps)} Phenotype objects") - # 6) Group results by patient records_by_patient = _group_records_by_patient(genotype_records, phenotype_records) From 01be17c8e29b2724b5ce25157df3a0aa63167127 Mon Sep 17 00:00:00 2001 From: Varenya Jain <78588847+VarenyaJ@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:22:24 +0200 Subject: [PATCH 02/36] Clean TODO for downloading an HPO JSON file from the Monarch repo --- src/P6/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 57d533a..cc23b6c 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -46,7 +46,6 @@ def main(): help="exact HPO release tag (e.g. 2025-03-03 or v2025-03-03)", ) def download(data_dir: str, hpo_version: typing.Optional[str]): - # TODO: download an HPO """ Download a specific or the latest HPO JSON release into the tests/data/ folder. """ From 206e41407b6c3c211a558c9a53a367170815c55e Mon Sep 17 00:00:00 2001 From: Varenya Jain <78588847+VarenyaJ@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:24:06 +0200 Subject: [PATCH 03/36] Clean up TODO for VariationDescriptor and gene_context, and remove TODO for Expression.HGVS --- src/P6/__main__.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index cc23b6c..de76e5a 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -234,9 +234,7 @@ def _write_phenopackets( genomic_interpretation_entry.InterpretationStatus.CONTRIBUTORY ) - # now fill in the VariationDescriptor - # TODO: set this up later - # Omit setting gene_context for now. + # TODO: Revise VariationDescriptor and gene_context later, omit setting gene_context for now. # variation_descriptor = genomic_interpretation_entry.variant_interpretation.variation_descriptor # we can also set variation_descriptor.gene_context and variation_descriptor.allelic_state here then serialize out as before # variation_descriptor.gene_context.gene_symbol = genotype_record.gene_symbol @@ -247,13 +245,13 @@ def _write_phenopackets( variation_descriptor = variant_interpretation.variation_descriptor # 1) Gene symbol & allelic state - # 'gene_context' is a message; you must CopyFrom if setting a message, - # but for its scalar fields you can still assign directly: + # 'gene_context' is a message; we need to CopyFrom if setting a message, + # but for its scalar fields we can still assign directly: variation_descriptor.gene_context.symbol = genotype_record.gene_symbol variation_descriptor.allelic_state.CopyFrom( pps2.OntologyClass( id="GENO:" - + genotype_record.zygosity_code, # or however you construct this + + genotype_record.zygosity_code, # or however we decide to construct this later on label=genotype_record.zygosity, ) ) @@ -285,12 +283,6 @@ def _write_phenopackets( # some protobuffs give trouble when trying to expose location/alleles so just skip pass - # TODO: when ready, add an Expression.HGVS here - # Record the HGVS genomic notation as an Expression - # expr = variation_descriptor.expressions.add() - # expr.syntax = Phenopacket.Diagnosis.GenomicInterpretation.VariantInterpretation.VariationDescriptor.Expression.HGVS - # expr.value = genotype_record.hgvsg - # 3d) Serialize to JSON generated_phenopacket_output_path = ( generated_phenopacket_output_dir / f"{patient_id}.json" From 021b19f6d10e2eeb2fe8789deeb66eb14516182d Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Tue, 5 Aug 2025 12:30:00 +0200 Subject: [PATCH 04/36] Rearrange the mapper --- src/P6/mapper.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index 169967d..42007fa 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -19,14 +19,6 @@ from .phenotype import Phenotype -class TableMapper(metaclass=abc.ABCMeta): - @abc.abstractmethod - def apply_mapping( - self, tables: dict[str, pd.DataFrame], notepad: Notepad - ) -> typing.Sequence[Phenopacket]: - pass - - # For any renamed field, the two neighbors it must sit between EXPECTED_COLUMN_NEIGHBORS = { "start_position": ("chromosome", "end_position"), @@ -71,6 +63,12 @@ def apply_mapping( "denovo": "de_novo_mutation", } +class TableMapper(metaclass=abc.ABCMeta): + @abc.abstractmethod + def apply_mapping( + self, tables: dict[str, pd.DataFrame], notepad: Notepad + ) -> typing.Sequence[Phenopacket]: + pass class DefaultMapper(TableMapper): def __init__(self, hpo: hpotk.MinimalOntology): From ce93fc19d91e8a4728dbb0266646301834543c1c Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Tue, 5 Aug 2025 14:09:44 +0200 Subject: [PATCH 05/36] Check that the sheet has raw coordinates or some other notation, and if both then these notations needs to say the same thing, then add verbose audit output and configurable strictness for variant checks: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Introduce `--verbose` flag in `parse-excel` to show preprocessing audit steps. - Colorize audit messages (info=cyan, warn=yellow, error=red) and indent them for better readability. - Add `--strict-variants` / `--no-strict-variants` option (default: warn) to treat raw↔HGVS mismatches as errors when enabled. - Pass `strict_variants` through to `DefaultMapper` and update `_check_hgvs_consistency` to honor the new strictness setting. --- src/P6/__main__.py | 82 +++++++++++++++++++++++++++++++++++++++++++--- src/P6/mapper.py | 72 +++++++++++++++++++++++++++++++++++----- 2 files changed, 142 insertions(+), 12 deletions(-) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index de76e5a..1f63d04 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -12,7 +12,7 @@ import sys import typing -from collections import defaultdict +from collections import defaultdict, namedtuple from datetime import datetime from google.protobuf.json_format import MessageToJson from stairval.notepad import create_notepad @@ -22,6 +22,7 @@ from .loader import load_sheets_as_tables from .mapper import DefaultMapper +AuditEntry = namedtuple("AuditEntry", ["step", "sheet", "message", "level"]) @click.group() def main(): @@ -76,7 +77,6 @@ def download(data_dir: str, hpo_version: typing.Optional[str]): click.echo(f"Saved HPO JSON to {out}") pass - @main.command(name="parse-excel") @click.option( "-e", @@ -93,7 +93,9 @@ def download(data_dir: str, hpo_version: typing.Optional[str]): type=click.Path(exists=True, dir_okay=False), help="path to a custom HPO JSON file (defaults to tests/data/hp.json)", ) -def parse_excel(excel_file: str, hpo_path: typing.Optional[str] = None): +@click.option("--strict-variants/--no-strict-variants", default=False, help=("Treat raw↔HGVS mismatches as errors (default: warn).")) +@click.option("--verbose", is_flag=True, help="Show preprocessing and classification steps") +def parse_excel(excel_file: str, hpo_path: typing.Optional[str] = None, verbose: bool = False, strict_variants: bool = False): """ Read each sheet, check column order, then: - Identify as a Genotype sheet if ALL GENOTYPE_KEY_COLUMNS are present. @@ -106,13 +108,31 @@ def parse_excel(excel_file: str, hpo_path: typing.Optional[str] = None): # 2) Build ontology and mapper ontology = _load_ontology(str(hpo_file)) - mapper = DefaultMapper(ontology) + mapper = DefaultMapper(ontology, strict_variants=strict_variants) # 3) Read all sheets into DataFrames tables = _read_sheets(excel_file) # tables = load_sheets_as_tables(excel_file) # Just use this for Pandas_Workaround. Don't call declare or call "_read_sheets" at all. Just use `tables = load_sheets_as_tables(excel_file)` which only needs `from .loader import load_sheets_as_tables` # TODO: Decide if it is better to implement `Pandas_Workaround` or just use Pandas + # optionally audit preprocessing + if verbose: + for entry in preprocess(tables): + #click.echo(f"[{entry.level.upper():7}] {entry.step:20} {entry.sheet:15} {entry.message}") + # indent every line… + indent = " " + line = f"{entry.step:20} {entry.sheet:15} {entry.message}" + # color by level + click.echo("") # blank line before mapping output + if entry.level == "error": + colored = click.style(line, fg="red") + elif entry.level in ("warn", "warning"): + colored = click.style(line, fg="yellow") + else: + colored = click.style(line, fg="cyan") + click.echo(indent + colored) + click.echo("") # a blank line before mapping output + # 4) Apply mapping to get raw records and collect issues notepad = create_notepad("phenopackets") genotype_records, phenotype_records = mapper.apply_mapping(tables, notepad) @@ -290,6 +310,60 @@ def _write_phenopackets( with open(generated_phenopacket_output_path, "w", encoding="utf-8") as out_f: out_f.write(MessageToJson(phenopacket)) +def preprocess(tables: dict[str, pd.DataFrame]) -> list[AuditEntry]: + """ + Run lightweight audits on each sheet: + - header normalization + - sheet classification + - variant‐column presence (raw vs HGVS) + """ + from .mapper import ( + RAW_VARIANT_COLUMNS, + HGVS_VARIANT_COLUMNS, + GENOTYPE_BASE_COLUMNS, + PHENOTYPE_KEY_COLUMNS, + ) + + entries: list[AuditEntry] = [] + + # Step 1: header counts + for name, df in tables.items(): + entries.append(AuditEntry( + step="normalize-headers", + sheet=name, + message=f"{len(df.columns)} cols", + level="info", + )) + + # Step 2: classify + for name, df in tables.items(): + cols = set(df.columns) + has_raw = RAW_VARIANT_COLUMNS.issubset(cols) + has_hgvs = bool(HGVS_VARIANT_COLUMNS & cols) + is_gen = GENOTYPE_BASE_COLUMNS.issubset(cols) and (has_raw or has_hgvs) + is_pheno = PHENOTYPE_KEY_COLUMNS.issubset(cols) + + kind = "genotype" if is_gen else "phenotype" if is_pheno else "skip" + entries.append(AuditEntry( + step="classify-sheet", + sheet=name, + message=kind + (f" ({'raw+hgvs' if has_raw and has_hgvs else 'raw' if has_raw else 'hgvs'})"), + level="info", + )) + + # Step 3: variant columns + for name, df in tables.items(): + cols = set(df.columns) + if GENOTYPE_BASE_COLUMNS.issubset(cols): + if not (RAW_VARIANT_COLUMNS.issubset(cols) or HGVS_VARIANT_COLUMNS & cols): + entries.append(AuditEntry( + step="variant-check", + sheet=name, + message="missing raw & HGVS", + level="error", + )) + return entries + if __name__ == "__main__": main() diff --git a/src/P6/mapper.py b/src/P6/mapper.py index 42007fa..bc4872f 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -63,6 +63,13 @@ "denovo": "de_novo_mutation", } +# Check Variant column groups: allow either the full raw coordinates or any HGVS notation +RAW_VARIANT_COLUMNS = {"chromosome", "start_position", "end_position", "reference", "alternate"} +HGVS_VARIANT_COLUMNS = {"hgvsg", "hgvsc", "hgvsp"} +# minimal base columns to call something a genotype sheet (we bring the index in later) +GENOTYPE_BASE_COLUMNS = {"contact_email", "phasing"} + + class TableMapper(metaclass=abc.ABCMeta): @abc.abstractmethod def apply_mapping( @@ -71,8 +78,13 @@ def apply_mapping( pass class DefaultMapper(TableMapper): - def __init__(self, hpo: hpotk.MinimalOntology): + def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False): + """ + - False: raw⇄HGVS mismatches are logged as WARNINGS + - True : raw⇄HGVS mismatches are logged as ERRORS + """ self._hpo = hpo + self.strict_variants = strict_variants def apply_mapping( self, tables: dict[str, pd.DataFrame], notepad: Notepad @@ -82,11 +94,17 @@ def apply_mapping( for sheet_name, df in tables.items(): columns = set(df.columns) + """ 1) classify: does this look like genotype, phenotype, or something to skip? """ + has_raw = RAW_VARIANT_COLUMNS.issubset(columns) + has_hgvs = bool(HGVS_VARIANT_COLUMNS & columns) """Send each sheet to the right extractor and collect all records.""" - is_genotype_sheet = GENOTYPE_KEY_COLUMNS.issubset(columns) + is_genotype_sheet = (GENOTYPE_BASE_COLUMNS.issubset(columns) and (has_raw or has_hgvs)) is_phenotype_sheet = PHENOTYPE_KEY_COLUMNS.issubset(columns) if is_genotype_sheet == is_phenotype_sheet: + # if we have both raw & HGVS notations, we need to validate that they match + if has_raw and has_hgvs: + self._check_hgvs_consistency(sheet_name, df, notepad) # ambiguous sheet should give a warning instead of an error notepad.add_warning( f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype" @@ -97,16 +115,54 @@ def apply_mapping( working = self._prepare_sheet(df, is_genotype_sheet) if is_genotype_sheet: - genotype_records.extend( - self._map_genotype(sheet_name, working, notepad) - ) + genotype_records.extend(self._map_genotype(sheet_name, working, notepad)) else: - phenotype_records.extend( - self._map_phenotype(sheet_name, working, notepad) - ) + phenotype_records.extend(self._map_phenotype(sheet_name, working, notepad)) return genotype_records, phenotype_records + def _check_hgvs_consistency( + self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> None: + ''' + If both raw coordinates and HGVS notation are present, ensure that the genotype notations match + ''' + + pattern = re.compile( + r"^(?:chr)?(?P[^:]+):g\.(?P\d+)" + r"(?P[ACGT]+)>(?P[ACGT]+)$", + re.IGNORECASE + ) + + for idx, row in df.iterrows(): + hgvs = str(row.get("hgvsg", "")).strip() + m = pattern.match(hgvs) + if not m: + # always treat malformed HGVS as an error + notepad.add_error(f"Sheet {sheet_name!r}, row {idx}: malformed HGVS g. notation {hgvs!r}") + continue + chromosome_name = m.group("chromosome_name") + mutation_position = int(m.group("mutation_position")) + reference = m.group("reference_allele") + alternative_allele = m.group("alternative_allele") + + # compare against raw columns + mismatch_msg = ( + f"Sheet {sheet_name!r}, row {idx}: HGVS '{hgvs}' disagrees with " + f"raw ({row['chromosome']}:{row['start_position']}-" + f"{row['end_position']} {row['reference']}>{row['alternate']})" + ) + if ( + str(row["chromosome"]) != chromosome_name + or int(row["start_position"]) != mutation_position + or int(row["end_position"]) != mutation_position + or str(row["reference"]) != reference_allele + or str(row["alternate"]) != alternative_allele + ): + if self.strict_variants: + notepad.add_error(mismatch_msg) + else: + notepad.add_warning(mismatch_msg) + def _prepare_sheet(self, df: pd.DataFrame, is_genotype: bool) -> pd.DataFrame: """Bring the index into a column and name it appropriately.""" column_id = "genotype_patient_ID" if is_genotype else "phenotype_patient_ID" From 308511c91fd7a7837c95657c800fb24cc0e3d32c Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Tue, 5 Aug 2025 14:13:59 +0200 Subject: [PATCH 06/36] Fix typo and ruff format --- src/P6/__main__.py | 67 +++++++++++++++++++++++++++++++--------------- src/P6/mapper.py | 46 ++++++++++++++++++++----------- 2 files changed, 76 insertions(+), 37 deletions(-) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 1f63d04..27e23c4 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -24,6 +24,7 @@ AuditEntry = namedtuple("AuditEntry", ["step", "sheet", "message", "level"]) + @click.group() def main(): """P6: Peter's Parse and Processing of Prenatal Particulars via Pandas.""" @@ -77,6 +78,7 @@ def download(data_dir: str, hpo_version: typing.Optional[str]): click.echo(f"Saved HPO JSON to {out}") pass + @main.command(name="parse-excel") @click.option( "-e", @@ -93,9 +95,20 @@ def download(data_dir: str, hpo_version: typing.Optional[str]): type=click.Path(exists=True, dir_okay=False), help="path to a custom HPO JSON file (defaults to tests/data/hp.json)", ) -@click.option("--strict-variants/--no-strict-variants", default=False, help=("Treat raw↔HGVS mismatches as errors (default: warn).")) -@click.option("--verbose", is_flag=True, help="Show preprocessing and classification steps") -def parse_excel(excel_file: str, hpo_path: typing.Optional[str] = None, verbose: bool = False, strict_variants: bool = False): +@click.option( + "--strict-variants/--no-strict-variants", + default=False, + help=("Treat raw↔HGVS mismatches as errors (default: warn)."), +) +@click.option( + "--verbose", is_flag=True, help="Show preprocessing and classification steps" +) +def parse_excel( + excel_file: str, + hpo_path: typing.Optional[str] = None, + verbose: bool = False, + strict_variants: bool = False, +): """ Read each sheet, check column order, then: - Identify as a Genotype sheet if ALL GENOTYPE_KEY_COLUMNS are present. @@ -118,7 +131,7 @@ def parse_excel(excel_file: str, hpo_path: typing.Optional[str] = None, verbose: # optionally audit preprocessing if verbose: for entry in preprocess(tables): - #click.echo(f"[{entry.level.upper():7}] {entry.step:20} {entry.sheet:15} {entry.message}") + # click.echo(f"[{entry.level.upper():7}] {entry.step:20} {entry.sheet:15} {entry.message}") # indent every line… indent = " " line = f"{entry.step:20} {entry.sheet:15} {entry.message}" @@ -310,6 +323,7 @@ def _write_phenopackets( with open(generated_phenopacket_output_path, "w", encoding="utf-8") as out_f: out_f.write(MessageToJson(phenopacket)) + def preprocess(tables: dict[str, pd.DataFrame]) -> list[AuditEntry]: """ Run lightweight audits on each sheet: @@ -328,12 +342,14 @@ def preprocess(tables: dict[str, pd.DataFrame]) -> list[AuditEntry]: # Step 1: header counts for name, df in tables.items(): - entries.append(AuditEntry( - step="normalize-headers", - sheet=name, - message=f"{len(df.columns)} cols", - level="info", - )) + entries.append( + AuditEntry( + step="normalize-headers", + sheet=name, + message=f"{len(df.columns)} cols", + level="info", + ) + ) # Step 2: classify for name, df in tables.items(): @@ -344,24 +360,31 @@ def preprocess(tables: dict[str, pd.DataFrame]) -> list[AuditEntry]: is_pheno = PHENOTYPE_KEY_COLUMNS.issubset(cols) kind = "genotype" if is_gen else "phenotype" if is_pheno else "skip" - entries.append(AuditEntry( - step="classify-sheet", - sheet=name, - message=kind + (f" ({'raw+hgvs' if has_raw and has_hgvs else 'raw' if has_raw else 'hgvs'})"), - level="info", - )) + entries.append( + AuditEntry( + step="classify-sheet", + sheet=name, + message=kind + + ( + f" ({'raw+hgvs' if has_raw and has_hgvs else 'raw' if has_raw else 'hgvs'})" + ), + level="info", + ) + ) # Step 3: variant columns for name, df in tables.items(): cols = set(df.columns) if GENOTYPE_BASE_COLUMNS.issubset(cols): if not (RAW_VARIANT_COLUMNS.issubset(cols) or HGVS_VARIANT_COLUMNS & cols): - entries.append(AuditEntry( - step="variant-check", - sheet=name, - message="missing raw & HGVS", - level="error", - )) + entries.append( + AuditEntry( + step="variant-check", + sheet=name, + message="missing raw & HGVS", + level="error", + ) + ) return entries diff --git a/src/P6/mapper.py b/src/P6/mapper.py index bc4872f..b62a086 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -64,7 +64,13 @@ } # Check Variant column groups: allow either the full raw coordinates or any HGVS notation -RAW_VARIANT_COLUMNS = {"chromosome", "start_position", "end_position", "reference", "alternate"} +RAW_VARIANT_COLUMNS = { + "chromosome", + "start_position", + "end_position", + "reference", + "alternate", +} HGVS_VARIANT_COLUMNS = {"hgvsg", "hgvsc", "hgvsp"} # minimal base columns to call something a genotype sheet (we bring the index in later) GENOTYPE_BASE_COLUMNS = {"contact_email", "phasing"} @@ -77,6 +83,7 @@ def apply_mapping( ) -> typing.Sequence[Phenopacket]: pass + class DefaultMapper(TableMapper): def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False): """ @@ -98,7 +105,9 @@ def apply_mapping( has_raw = RAW_VARIANT_COLUMNS.issubset(columns) has_hgvs = bool(HGVS_VARIANT_COLUMNS & columns) """Send each sheet to the right extractor and collect all records.""" - is_genotype_sheet = (GENOTYPE_BASE_COLUMNS.issubset(columns) and (has_raw or has_hgvs)) + is_genotype_sheet = GENOTYPE_BASE_COLUMNS.issubset(columns) and ( + has_raw or has_hgvs + ) is_phenotype_sheet = PHENOTYPE_KEY_COLUMNS.issubset(columns) if is_genotype_sheet == is_phenotype_sheet: @@ -115,22 +124,27 @@ def apply_mapping( working = self._prepare_sheet(df, is_genotype_sheet) if is_genotype_sheet: - genotype_records.extend(self._map_genotype(sheet_name, working, notepad)) + genotype_records.extend( + self._map_genotype(sheet_name, working, notepad) + ) else: - phenotype_records.extend(self._map_phenotype(sheet_name, working, notepad)) + phenotype_records.extend( + self._map_phenotype(sheet_name, working, notepad) + ) return genotype_records, phenotype_records def _check_hgvs_consistency( - self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> None: - ''' + self, sheet_name: str, df: pd.DataFrame, notepad: Notepad + ) -> None: + """ If both raw coordinates and HGVS notation are present, ensure that the genotype notations match - ''' + """ pattern = re.compile( r"^(?:chr)?(?P[^:]+):g\.(?P\d+)" r"(?P[ACGT]+)>(?P[ACGT]+)$", - re.IGNORECASE + re.IGNORECASE, ) for idx, row in df.iterrows(): @@ -138,11 +152,13 @@ def _check_hgvs_consistency( m = pattern.match(hgvs) if not m: # always treat malformed HGVS as an error - notepad.add_error(f"Sheet {sheet_name!r}, row {idx}: malformed HGVS g. notation {hgvs!r}") + notepad.add_error( + f"Sheet {sheet_name!r}, row {idx}: malformed HGVS g. notation {hgvs!r}" + ) continue chromosome_name = m.group("chromosome_name") mutation_position = int(m.group("mutation_position")) - reference = m.group("reference_allele") + reference_allele = m.group("reference_allele") alternative_allele = m.group("alternative_allele") # compare against raw columns @@ -152,11 +168,11 @@ def _check_hgvs_consistency( f"{row['end_position']} {row['reference']}>{row['alternate']})" ) if ( - str(row["chromosome"]) != chromosome_name - or int(row["start_position"]) != mutation_position - or int(row["end_position"]) != mutation_position - or str(row["reference"]) != reference_allele - or str(row["alternate"]) != alternative_allele + str(row["chromosome"]) != chromosome_name + or int(row["start_position"]) != mutation_position + or int(row["end_position"]) != mutation_position + or str(row["reference"]) != reference_allele + or str(row["alternate"]) != alternative_allele ): if self.strict_variants: notepad.add_error(mismatch_msg) From 2473094b2cf37380cbd620c2e7078eb4f2ea1cb8 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Tue, 5 Aug 2025 15:20:35 +0200 Subject: [PATCH 07/36] feat(cli): add `audit-excel` command and expand preprocessing tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - **New CLI command** `audit-excel` in `src/P6/__main__.py` - Options: - `-e, --excel-path ` (required) to specify the workbook - `-r, --report-json` flag to toggle JSON output - Reads sheets via existing `_read_sheets()` - Calls `preprocess()` to collect audit entries - Renders either: - Human-readable table (columns: SHEET, STEP, LEVEL, MESSAGE; two spaces between columns for reliable parsing) - JSON array of `{ step, sheet, level, message }` objects with `--report-json` - **Tests for `audit-excel`** in `tests/test_cli_audit_excel.py` - `test_audit_excel_table_output`: - Invokes `audit-excel -e ` - Asserts exit code `0` - Verifies header line begins with `SHEET` - Splits each subsequent line on `\s{2,}` and checks for ≥4 columns - `test_audit_excel_json_output`: - Invokes with `-r` - Parses output as JSON list of dicts - Asserts each dict contains keys `step`, `sheet`, `level`, `message` - **Unit tests for `preprocess()`** in `tests/test_preprocess.py` - Fixture `simple_workbook`: generates a minimal valid genotype sheet in a temp Excel file - `test_preprocess_detects_variant_sheet`: - Loads tables via `load_sheets_as_tables()` - Runs `preprocess()` - Asserts presence of a `classify-sheet` entry for “geno” - Asserts no error‐level `variant-check` entry on valid input - **Non-functional changes** - Imported `json` at top of `__main__.py` - No modifications to existing `parse-excel` or phenopacket serialization logic --- src/P6/__main__.py | 32 +++++++++++++++++++++++++ tests/test_cli_audit_excel.py | 45 +++++++++++++++++++++++++++++++++++ tests/test_preprocess.py | 35 +++++++++++++++++++++++++++ 3 files changed, 112 insertions(+) create mode 100644 tests/test_cli_audit_excel.py create mode 100644 tests/test_preprocess.py diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 27e23c4..64a118b 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -6,6 +6,7 @@ import click import hpotk +import json import pandas as pd # Not needed for Pandas_Workaround, i.e. don't call declare or call "_read_sheets" at all, just use `tables = load_sheets_as_tables(excel_file)` which only needs `from .loader import load_sheets_as_tables` import pathlib import requests @@ -31,6 +32,37 @@ def main(): pass +@main.command(name="audit-excel") +@click.option("-e", "--excel-path", "excel_file", required=True, type=click.Path(exists=True, dir_okay=False), help="path to the Excel workbook") +@click.option("-r", "--report-json", "report_json", is_flag=True, help="output audit report as JSON instead of table") +def audit_excel(excel_file: str, report_json: bool): + """ + Run a preprocessing audit on each sheet in the given workbook: + - header normalization + - sheet classification (genotype/phenotype/skip) + - variant‐column presence checks + """ + # 1) Read sheets + tables = _read_sheets(excel_file) + + # 2) Produce audit entries + from .__main__ import preprocess + entries = preprocess(tables) + + # 3) Render report + if report_json: + # turn each AuditEntry into a serializable dict + payload = [ + {"step": e.step, "sheet": e.sheet, "level": e.level, "message": e.message} + for e in entries + ] + click.echo(json.dumps(payload, indent=2)) + else: + # table header + click.echo(f"{'SHEET':20} {'STEP':25} {'LEVEL':8} MESSAGE") + for e in entries: + click.echo(f"{e.sheet:20} {e.step:25} {e.level:8} {e.message}") + @main.command(name="download") @click.option( "-d", diff --git a/tests/test_cli_audit_excel.py b/tests/test_cli_audit_excel.py new file mode 100644 index 0000000..ef2e8c5 --- /dev/null +++ b/tests/test_cli_audit_excel.py @@ -0,0 +1,45 @@ +import glob +import os +import pytest +import re +from click.testing import CliRunner +from P6.__main__ import main + +@pytest.fixture(scope="session", autouse=True) +def verify_hpo_file(): + # ensure HPO file is in place + data_dir = os.path.join(os.path.dirname(__file__), "data") + assert os.path.isdir(data_dir) + +@pytest.mark.parametrize( + "sample_xlsx", + glob.glob(os.path.join(os.path.dirname(__file__), "data", "*.xlsx")), +) +def test_audit_excel_table_output(sample_xlsx): + runner = CliRunner() + result = runner.invoke(main, ["audit-excel", "-e", sample_xlsx]) + assert result.exit_code == 0, result.output + + # first line should be our header + first = result.output.splitlines()[0] + assert first.startswith("SHEET"), "Expected table header" + # each subsequent line should have 4 columns + for line in result.output.splitlines()[1:]: + parts = re.split(r"\s{2,}", line.strip()) + assert len(parts) >= 4, f"Bad line in audit table: {line}" + +def test_audit_excel_json_output(tmp_path): + # pick any test workbook + sample = glob.glob(os.path.join(os.path.dirname(__file__), "data", "*.xlsx"))[0] + runner = CliRunner() + result = runner.invoke(main, ["audit-excel", "-e", sample, "-r"]) + assert result.exit_code == 0, result.output + + # JSON must parse to a list of dicts + import json + payload = json.loads(result.output) + assert isinstance(payload, list) + assert all(isinstance(obj, dict) for obj in payload) + # check expected keys + for obj in payload: + assert {"step","sheet","level","message"}.issubset(obj.keys()) diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py new file mode 100644 index 0000000..31416af --- /dev/null +++ b/tests/test_preprocess.py @@ -0,0 +1,35 @@ +import pytest +import pandas as pd +from P6.__main__ import preprocess +from P6.loader import load_sheets_as_tables + +@pytest.fixture +def simple_workbook(tmp_path): + # build a tiny Excel with one sheet and minimal columns + df = pd.DataFrame({ + "contact_email": ["a@b.com"], + "phasing": [True], + "chromosome": ["chr1"], + "start_position": [100], + "end_position": [100], + "reference": ["A"], + "alternate": ["T"], + "gene_symbol": ["GENE"], + "hgvsg": ["g.100A>T"], + "hgvsc": [""], + "hgvsp": [""], + "zygosity": ["het"], + "inheritance": ["unknown"], + }, index=["PAT1"]) + path = tmp_path / "wb.xlsx" + with pd.ExcelWriter(path, engine="openpyxl") as w: + df.to_excel(w, sheet_name="geno") + return str(path) + +def test_preprocess_detects_variant_sheet(simple_workbook): + tables = load_sheets_as_tables(simple_workbook) + entries = preprocess(tables) + # we expect at least one classify-sheet entry + assert any(e.step == "classify-sheet" and e.sheet == "geno" for e in entries) + # and no variant-check errors since this is valid + assert not any(e.step == "variant-check" and e.level == "error" for e in entries) From 7e6cf2ee3a9c6234bb237ae43258ef82dd7b5213 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Tue, 5 Aug 2025 15:21:29 +0200 Subject: [PATCH 08/36] Ruff formatting --- src/P6/__main__.py | 21 ++++++++++++++++++--- tests/test_cli_audit_excel.py | 9 ++++++--- tests/test_preprocess.py | 35 ++++++++++++++++++++--------------- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 64a118b..1eca60d 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -33,8 +33,21 @@ def main(): @main.command(name="audit-excel") -@click.option("-e", "--excel-path", "excel_file", required=True, type=click.Path(exists=True, dir_okay=False), help="path to the Excel workbook") -@click.option("-r", "--report-json", "report_json", is_flag=True, help="output audit report as JSON instead of table") +@click.option( + "-e", + "--excel-path", + "excel_file", + required=True, + type=click.Path(exists=True, dir_okay=False), + help="path to the Excel workbook", +) +@click.option( + "-r", + "--report-json", + "report_json", + is_flag=True, + help="output audit report as JSON instead of table", +) def audit_excel(excel_file: str, report_json: bool): """ Run a preprocessing audit on each sheet in the given workbook: @@ -47,6 +60,7 @@ def audit_excel(excel_file: str, report_json: bool): # 2) Produce audit entries from .__main__ import preprocess + entries = preprocess(tables) # 3) Render report @@ -55,7 +69,7 @@ def audit_excel(excel_file: str, report_json: bool): payload = [ {"step": e.step, "sheet": e.sheet, "level": e.level, "message": e.message} for e in entries - ] + ] click.echo(json.dumps(payload, indent=2)) else: # table header @@ -63,6 +77,7 @@ def audit_excel(excel_file: str, report_json: bool): for e in entries: click.echo(f"{e.sheet:20} {e.step:25} {e.level:8} {e.message}") + @main.command(name="download") @click.option( "-d", diff --git a/tests/test_cli_audit_excel.py b/tests/test_cli_audit_excel.py index ef2e8c5..f41e4b4 100644 --- a/tests/test_cli_audit_excel.py +++ b/tests/test_cli_audit_excel.py @@ -5,15 +5,16 @@ from click.testing import CliRunner from P6.__main__ import main + @pytest.fixture(scope="session", autouse=True) def verify_hpo_file(): # ensure HPO file is in place data_dir = os.path.join(os.path.dirname(__file__), "data") assert os.path.isdir(data_dir) + @pytest.mark.parametrize( - "sample_xlsx", - glob.glob(os.path.join(os.path.dirname(__file__), "data", "*.xlsx")), + "sample_xlsx", glob.glob(os.path.join(os.path.dirname(__file__), "data", "*.xlsx")) ) def test_audit_excel_table_output(sample_xlsx): runner = CliRunner() @@ -28,6 +29,7 @@ def test_audit_excel_table_output(sample_xlsx): parts = re.split(r"\s{2,}", line.strip()) assert len(parts) >= 4, f"Bad line in audit table: {line}" + def test_audit_excel_json_output(tmp_path): # pick any test workbook sample = glob.glob(os.path.join(os.path.dirname(__file__), "data", "*.xlsx"))[0] @@ -37,9 +39,10 @@ def test_audit_excel_json_output(tmp_path): # JSON must parse to a list of dicts import json + payload = json.loads(result.output) assert isinstance(payload, list) assert all(isinstance(obj, dict) for obj in payload) # check expected keys for obj in payload: - assert {"step","sheet","level","message"}.issubset(obj.keys()) + assert {"step", "sheet", "level", "message"}.issubset(obj.keys()) diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py index 31416af..e8d75c4 100644 --- a/tests/test_preprocess.py +++ b/tests/test_preprocess.py @@ -3,29 +3,34 @@ from P6.__main__ import preprocess from P6.loader import load_sheets_as_tables + @pytest.fixture def simple_workbook(tmp_path): # build a tiny Excel with one sheet and minimal columns - df = pd.DataFrame({ - "contact_email": ["a@b.com"], - "phasing": [True], - "chromosome": ["chr1"], - "start_position": [100], - "end_position": [100], - "reference": ["A"], - "alternate": ["T"], - "gene_symbol": ["GENE"], - "hgvsg": ["g.100A>T"], - "hgvsc": [""], - "hgvsp": [""], - "zygosity": ["het"], - "inheritance": ["unknown"], - }, index=["PAT1"]) + df = pd.DataFrame( + { + "contact_email": ["a@b.com"], + "phasing": [True], + "chromosome": ["chr1"], + "start_position": [100], + "end_position": [100], + "reference": ["A"], + "alternate": ["T"], + "gene_symbol": ["GENE"], + "hgvsg": ["g.100A>T"], + "hgvsc": [""], + "hgvsp": [""], + "zygosity": ["het"], + "inheritance": ["unknown"], + }, + index=["PAT1"], + ) path = tmp_path / "wb.xlsx" with pd.ExcelWriter(path, engine="openpyxl") as w: df.to_excel(w, sheet_name="geno") return str(path) + def test_preprocess_detects_variant_sheet(simple_workbook): tables = load_sheets_as_tables(simple_workbook) entries = preprocess(tables) From e85b4b3dff406b8757bc45bbaa51b79899a5ea64 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Tue, 5 Aug 2025 15:32:22 +0200 Subject: [PATCH 09/36] Augment README --- README.md | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 82e5221..d1e7eb5 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,12 @@ A simple, extensible CLI for downloading the Human Phenotype Ontology, parsing g 3. [Installation](#installation) 4. [Quickstart](#quickstart) - [Download HPO JSON](#download-hpo-json) - - [Parse Excel to Phenopackets](#parse-excel-to-phenopackets) + - [Parse Excel to Phenopackets](#parse-excel-to-phenopackets) + - [Audit Excel Workbooks](#audit-excel-workbooks) 5. [CLI Reference](#cli-reference) - [`p6 download`](#p6-download) - [`p6 parse-excel`](#p6-parse-excel) + - [`p6 audit-excel`](#p6-audit-excel) 6. [Development & Testing](#development--testing) 7. [Contributing](#contributing) 8. [License](#license) @@ -94,6 +96,18 @@ Resulting phenopacket files will be under: phenopacket_from_excel/$(date "+%Y-%m-%d_%H-%M-%S")/phenopackets/ ``` +### Audit Excel Workbooks + +Quickly check each sheet in an Excel file for header normalization, sheet classification, and presence of required variant columns. +```bash +p6 audit-excel -e tests/data/Sydney_Python_transformation.xlsx +``` + +By default you get a table; use `-r` for a JSON output to the console. +```bash +p6 audit-excel -e tests/data/Sydney_Python_transformation.xlsx -r +``` + ## CLI Reference ### p6 download @@ -101,11 +115,13 @@ phenopacket_from_excel/$(date "+%Y-%m-%d_%H-%M-%S")/phenopackets/ Usage: ```markdown p6 download [OPTIONS] +``` Options: - -d, --data-path PATH where to save HPO JSON (default: tests/data) - -v, --hpo-version TEXT exact HPO release tag (e.g. 2025-03-03 or v2025-03-03) - --help Show this help message and exit. +```markdown + -d, --data-path PATH where to save HPO JSON (default: tests/data) + -v, --hpo-version TEXT exact HPO release tag (e.g. 2025-03-03 or v2025-03-03) + --help Show this help message and exit. ``` Examples: @@ -130,9 +146,9 @@ Usage: `p6 parse-excel [OPTIONS] EXCEL_FILE` Options: ```markdown - -e, --excel-path FILE path to the Excel workbook [required] - -hpo, --custom-hpo FILE path to a custom HPO JSON file (defaults to `tests/data/hp.json`) - --help Show this message and exit. + -e, --excel-path FILE path to the Excel workbook [required] + -hpo, --custom-hpo FILE path to a custom HPO JSON file (defaults to `tests/data/hp.json`) + --help Show this message and exit. ``` Example: @@ -142,6 +158,19 @@ Explicitly point at a custom HPO file: p6 parse-excel -e tests/data/Sydney_Python_transformation.xlsx -hpo src/P6/hp.json ``` +### p6 audit-excel + +Run a lightweight audit on each sheet in an Excel workbook, reporting header counts, sheet classification, and missing variant‐column checks. + +Usage: `p6 audit-excel [OPTIONS] EXCEL_FILE` + +Options: +```markdown + -e, --excel-path FILE path to the Excel workbook [required] + -r, --report-json output audit report as JSON instead of table + --help Show this message and exit. +``` + ## Development & Testing Install dev requirements: From 9744ee986f87ef321a78a890d602e9b635f4a80a Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 12:08:51 +0200 Subject: [PATCH 10/36] Create new files for optional categories to fill out Phenopacket Schema --- src/P6/biosample.py | 23 +++++++++++++++++++++++ src/P6/disease.py | 25 +++++++++++++++++++++++++ src/P6/measurement.py | 25 +++++++++++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 src/P6/biosample.py create mode 100644 src/P6/disease.py create mode 100644 src/P6/measurement.py diff --git a/src/P6/biosample.py b/src/P6/biosample.py new file mode 100644 index 0000000..0f58cee --- /dev/null +++ b/src/P6/biosample.py @@ -0,0 +1,23 @@ +""" +Biosample domain model. + +Defines the BiosampleRecord dataclass for capturing sample metadata. +""" + +from dataclasses import dataclass + +@dataclass +class BiosampleRecord: + """ + Represents a biosample entry for a patient. + + Attributes: + patient_ID: Unique alphanumeric patient identifier. + biosample_id: Unique identifier for the biosample. + biosample_type: CURIE of the tissue or sample type (e.g. 'UBERON:0002107'). + collection_date: Date string in 'YYYY-MM-DD' format. + """ + patient_ID: str + biosample_id: str + biosample_type: str + collection_date: str diff --git a/src/P6/disease.py b/src/P6/disease.py new file mode 100644 index 0000000..2993d40 --- /dev/null +++ b/src/P6/disease.py @@ -0,0 +1,25 @@ +""" +Disease domain model. + +Defines the DiseaseRecord dataclass for capturing disease annotations. +""" + +from dataclasses import dataclass + +@dataclass +class DiseaseRecord: + """ + Represents a disease entry for a patient. + + Attributes: + patient_ID: Unique alphanumeric patient identifier. + disease_term: CURIE of the disease term (e.g. 'OMIM:266600'). + disease_label: Human-readable label for the disease. + disease_onset: Date string in 'YYYY-MM-DD' format. + disease_status: True if the disease is present, False if excluded. + """ + patient_ID: str + disease_term: str + disease_label: str + disease_onset: str + disease_status: bool diff --git a/src/P6/measurement.py b/src/P6/measurement.py new file mode 100644 index 0000000..340c331 --- /dev/null +++ b/src/P6/measurement.py @@ -0,0 +1,25 @@ +""" +Measurement domain model. + +Defines the MeasurementRecord dataclass for capturing quantitative measurements. +""" + +from dataclasses import dataclass + +@dataclass +class MeasurementRecord: + """ + Represents a measurement entry for a patient. + + Attributes: + patient_ID: Unique alphanumeric patient identifier. + measurement_type: CURIE of measurement type (e.g. 'LOINC:4548-4'). + measurement_value: Numeric value of the measurement. + measurement_unit: Unit CURIE or string (e.g. 'mmol/L'). + measurement_timestamp: ISO timestamp string (e.g. '2025-07-15T14:23:00'). + """ + patient_ID: str + measurement_type: str + measurement_value: float + measurement_unit: str + measurement_timestamp: str From bb73625892874d614b2305682f0a3da6325a4b50 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 12:37:45 +0200 Subject: [PATCH 11/36] Start to implement parsing of optional sheets to further fill out the outputted phenopackets --- src/P6/__main__.py | 31 +++++++++++++++++++++++++--- src/P6/loader.py | 11 ++++++++++ src/P6/mapper.py | 51 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 88 insertions(+), 5 deletions(-) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 1eca60d..698edc5 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -195,7 +195,7 @@ def parse_excel( # 4) Apply mapping to get raw records and collect issues notepad = create_notepad("phenopackets") - genotype_records, phenotype_records = mapper.apply_mapping(tables, notepad) + genotype_records, phenotype_records, disease_records, measurement_records, biosample_records= mapper.apply_mapping(tables, notepad) # 5) Report any errors or warnings _report_issues(notepad) @@ -255,14 +255,20 @@ def _report_issues(notepad): def _group_records_by_patient( - genotype_records: list, phenotype_records: list + genotype_records: list, phenotype_records: list, disease_records: list, measurement_records: list, biosample_records: list ) -> dict[str, dict[str, list]]: # Group genotype & phenotype records by patient ID - records = defaultdict(lambda: {"genotype_records": [], "phenotype_records": []}) + records = defaultdict(lambda: {"genotype_records": [], "phenotype_records": [], "disease_records": [], "measurement_records": [], "biosample_records": []}) for genotype in genotype_records: records[genotype.genotype_patient_ID]["genotype_records"].append(genotype) for phenotype in phenotype_records: records[phenotype.phenotype_patient_ID]["phenotype_records"].append(phenotype) + for disease in disease_records: + records[disease.patient_ID]["disease_records"].append(disease) + for measurement in measurement_records: + records[measurement.patient_ID]["measurement_records"].append(measurement) + for biosample in biosample_records: + records[biosample.patient_ID]["biosample_records"].append(biosample) return records @@ -363,6 +369,25 @@ def _write_phenopackets( # some protobuffs give trouble when trying to expose location/alleles so just skip pass + # 3c) Add optional entries (if any): + for d in patient_data["disease_records"]: + ds = phenopacket.diseases.add() + ds.term.id = d.disease_term + ds.term.label = d.disease_label + ds.onset = d.disease_onset + ds.status = d.disease_status + for m in patient_data["measurement_records"]: + meas = phenopacket.measurements.add() + meas.type.id = m.measurement_type + meas.value = m.measurement_value + meas.unit = m.measurement_unit + meas.timestamp = m.measurement_timestamp + for b in patient_data["biosample_records"]: + bs = phenopacket.biosamples.add() + bs.id = b.biosample_id + bs.type.id = b.biosample_type + bs.collection_time = b.collection_date + # 3d) Serialize to JSON generated_phenopacket_output_path = ( generated_phenopacket_output_dir / f"{patient_id}.json" diff --git a/src/P6/loader.py b/src/P6/loader.py index a353d95..6cc8f55 100644 --- a/src/P6/loader.py +++ b/src/P6/loader.py @@ -13,6 +13,17 @@ "hpo": "hpo_id", "hpo_term": "hpo_id", # also catch "HPO Term" → hpo_term → hpo_id "timestamp": "date_of_observation", + "disease_term": "disease_term", + "disease_label": "disease_label", + "disease_onset": "disease_onset", + "disease_status": "disease_status", + "measurement_type": "measurement_type", + "measurement_value": "measurement_value", + "measurement_unit": "measurement_unit", + "measurement_timestamp": "measurement_timestamp", + "biosample_id": "biosample_id", + "biosample_type": "biosample_type", + "collection_date": "collection_date", } diff --git a/src/P6/mapper.py b/src/P6/mapper.py index b62a086..e70cea8 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -15,7 +15,10 @@ from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket from stairval.notepad import Notepad +from .biosample import BiosampleRecord +from .disease import DiseaseRecord from .genotype import Genotype +from .measurement import MeasurementRecord from .phenotype import Phenotype @@ -47,6 +50,12 @@ PHENOTYPE_KEY_COLUMNS = {"hpo_id", "date_of_observation", "status"} +# Key columns to identify additional sheets +DISEASE_KEY_COLUMNS = {"disease_term", "disease_onset"} +MEASUREMENT_KEY_COLUMNS = {"measurement_type", "measurement_value", "measurement_unit"} +BIOSAMPLE_KEY_COLUMNS = {"biosample_id", "biosample_type", "collection_date"} + + # Map raw zygosity abbreviations to allowed dataclass zygosity values ZYGOSITY_MAP = { "het": "heterozygous", @@ -95,9 +104,12 @@ def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False): def apply_mapping( self, tables: dict[str, pd.DataFrame], notepad: Notepad - ) -> tuple[list[Genotype], list[Phenotype]]: + ) -> tuple[list[Genotype], list[Phenotype], list[DiseaseRecord], list[MeasurementRecord], list[BiosampleRecord]]: genotype_records: list[Genotype] = [] phenotype_records: list[Phenotype] = [] + disease_records: list[DiseaseRecord] = [] + measurement_records: list[MeasurementRecord] = [] + biosample_records: list[BiosampleRecord] = [] for sheet_name, df in tables.items(): columns = set(df.columns) @@ -132,7 +144,18 @@ def apply_mapping( self._map_phenotype(sheet_name, working, notepad) ) - return genotype_records, phenotype_records + # New Fields + if DISEASE_KEY_COLUMNS.issubset(columns): + disease_records.extend(self._map_disease(sheet_name, working, notepad)) + continue + if MEASUREMENT_KEY_COLUMNS.issubset(columns): + measurement_records.extend(self._map_measurement(sheet_name, working, notepad)) + continue + if BIOSAMPLE_KEY_COLUMNS.issubset(columns): + biosample_records.extend(self._map_biosample(sheet_name, working, notepad)) + continue + + return genotype_records, phenotype_records, disease_records, measurement_records, biosample_records, biosample_records def _check_hgvs_consistency( self, sheet_name: str, df: pd.DataFrame, notepad: Notepad @@ -338,3 +361,27 @@ def _map_phenotype( notepad.add_warning(msg) return records + + def _map_disease(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[DiseaseRecord]: + """ + Map each row in a disease sheet to a DiseaseRecord. + """ + records: list[DiseaseRecord] = [] + # TODO: implement row→DiseaseRecord conversion + raise NotImplementedError + + def _map_measurement(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[MeasurementRecord]: + """ + Map each row in a measurement sheet to a MeasurementRecord. + """ + records: list[MeasurementRecord] = [] + # TODO: implement row→MeasurementRecord conversion + raise NotImplementedError + + def _map_biosample(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[BiosampleRecord]: + """ + Map each row in a biosample sheet to a BiosampleRecord. + """ + records: list[BiosampleRecord] = [] + # TODO: implement row→BiosampleRecord conversion + raise NotImplementedError From 55a9c5c5f5e43fb118df023cbbeb95c9c94ead0d Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 12:38:04 +0200 Subject: [PATCH 12/36] Write a test for the new disease, measurements, and biosample implementations --- tests/test_full_features.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 tests/test_full_features.py diff --git a/tests/test_full_features.py b/tests/test_full_features.py new file mode 100644 index 0000000..e518e5c --- /dev/null +++ b/tests/test_full_features.py @@ -0,0 +1,37 @@ +import os +import re +import glob +import json +import pytest +from click.testing import CliRunner +from P6.__main__ import main + +# Path to the “full” example workbook (e.g. created under tests/data/full_example.xlsx) +FULL_XLSX = os.path.join(os.path.dirname(__file__), "data", "Sydney_Python_transformation.xlsx") + +def test_full_features_parse_creates_all_blocks(decompressed_hpo): + runner = CliRunner() + result = runner.invoke( + main, + [ + "parse-excel", + "-e", FULL_XLSX, + "--custom-hpo", decompressed_hpo + ], + ) + assert result.exit_code == 0, result.output + + # Extract the output directory from the summary line + m = re.search(r"Wrote \d+ phenopacket files to (.+)", result.output) + assert m, "Did not find output directory in CLI summary" + outdir = m.group(1).strip() + + # There should be at least one JSON file + json_files = glob.glob(os.path.join(outdir, "*.json")) + assert json_files, f"No JSON files written to {outdir}" + + # Load the first phenopacket and check new keys + pkt = json.load(open(json_files[0], encoding="utf-8")) + assert isinstance(pkt.get("diseases", []), list), "Missing 'diseases' block" + assert isinstance(pkt.get("measurements", []), list), "Missing 'measurements' block" + assert isinstance(pkt.get("biosamples", []), list), "Missing 'biosamples' block" From 30081c891355969fd706c37577e52866c4b5040e Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 12:41:31 +0200 Subject: [PATCH 13/36] Fix indentation errors :( --- src/P6/mapper.py | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index e70cea8..3d6a98e 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -104,12 +104,7 @@ def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False): def apply_mapping( self, tables: dict[str, pd.DataFrame], notepad: Notepad - ) -> tuple[list[Genotype], list[Phenotype], list[DiseaseRecord], list[MeasurementRecord], list[BiosampleRecord]]: - genotype_records: list[Genotype] = [] - phenotype_records: list[Phenotype] = [] - disease_records: list[DiseaseRecord] = [] - measurement_records: list[MeasurementRecord] = [] - biosample_records: list[BiosampleRecord] = [] + ) -> tuple[list[Genotype] = [], list[Phenotype] = [], list[DiseaseRecord] = [], list[MeasurementRecord] = [], list[BiosampleRecord] = []]: for sheet_name, df in tables.items(): columns = set(df.columns) @@ -155,7 +150,7 @@ def apply_mapping( biosample_records.extend(self._map_biosample(sheet_name, working, notepad)) continue - return genotype_records, phenotype_records, disease_records, measurement_records, biosample_records, biosample_records + return genotype_records, phenotype_records, disease_records, measurement_records, biosample_records def _check_hgvs_consistency( self, sheet_name: str, df: pd.DataFrame, notepad: Notepad @@ -363,25 +358,25 @@ def _map_phenotype( return records def _map_disease(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[DiseaseRecord]: - """ - Map each row in a disease sheet to a DiseaseRecord. - """ - records: list[DiseaseRecord] = [] - # TODO: implement row→DiseaseRecord conversion - raise NotImplementedError + """ + Map each row in a disease sheet to a DiseaseRecord. + """ + records: list[DiseaseRecord] = [] + # TODO: implement row→DiseaseRecord conversion + raise NotImplementedError def _map_measurement(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[MeasurementRecord]: - """ - Map each row in a measurement sheet to a MeasurementRecord. - """ - records: list[MeasurementRecord] = [] - # TODO: implement row→MeasurementRecord conversion - raise NotImplementedError + """ + Map each row in a measurement sheet to a MeasurementRecord. + """ + records: list[MeasurementRecord] = [] + # TODO: implement row→MeasurementRecord conversion + raise NotImplementedError def _map_biosample(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[BiosampleRecord]: - """ - Map each row in a biosample sheet to a BiosampleRecord. - """ - records: list[BiosampleRecord] = [] - # TODO: implement row→BiosampleRecord conversion - raise NotImplementedError + """ + Map each row in a biosample sheet to a BiosampleRecord. + """ + records: list[BiosampleRecord] = [] + # TODO: implement row→BiosampleRecord conversion + raise NotImplementedError From 15d45b24ecef7ea53c43afdd09ee6cc4e5317671 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 12:43:45 +0200 Subject: [PATCH 14/36] unpack & group all five lists in __main__.py --- src/P6/__main__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 698edc5..78b88f1 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -195,13 +195,19 @@ def parse_excel( # 4) Apply mapping to get raw records and collect issues notepad = create_notepad("phenopackets") - genotype_records, phenotype_records, disease_records, measurement_records, biosample_records= mapper.apply_mapping(tables, notepad) + ( + genotype_records, + phenotype_records, + disease_records, + measurement_records, + biosample_records + ) = mapper.apply_mapping(tables, notepad) # 5) Report any errors or warnings _report_issues(notepad) # 6) Group results by patient - records_by_patient = _group_records_by_patient(genotype_records, phenotype_records) + records_by_patient = _group_records_by_patient(genotype_records, phenotype_records, disease_records, measurement_records, biosample_records) # 7) Prepare output directory with timestamp # Will contain genotype and phenotype records as JSON From 4a471b97f2a03b195e87232423e86edfa98b6b6e Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 14:02:34 +0200 Subject: [PATCH 15/36] Correct typos --- src/P6/mapper.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index 3d6a98e..c6d3d22 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -104,7 +104,24 @@ def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False): def apply_mapping( self, tables: dict[str, pd.DataFrame], notepad: Notepad - ) -> tuple[list[Genotype] = [], list[Phenotype] = [], list[DiseaseRecord] = [], list[MeasurementRecord] = [], list[BiosampleRecord] = []]: + ) -> tuple[ + list[Genotype], + list[Phenotype], + list[DiseaseRecord], + list[MeasurementRecord], + list[BiosampleRecord], + ]: + """ + 1) classify each sheet as genotype / phenotype / disease / measurement / biosample + 2) call the matching mapper + """ + # initialize the lists to return + genotype_records: list[Genotype] = [] + phenotype_records: list[Phenotype] = [] + disease_records: list[DiseaseRecord] = [] + measurement_records: list[MeasurementRecord] = [] + biosample_records: list[BiosampleRecord] = [] + for sheet_name, df in tables.items(): columns = set(df.columns) @@ -112,9 +129,7 @@ def apply_mapping( has_raw = RAW_VARIANT_COLUMNS.issubset(columns) has_hgvs = bool(HGVS_VARIANT_COLUMNS & columns) """Send each sheet to the right extractor and collect all records.""" - is_genotype_sheet = GENOTYPE_BASE_COLUMNS.issubset(columns) and ( - has_raw or has_hgvs - ) + is_genotype_sheet = GENOTYPE_BASE_COLUMNS.issubset(columns) and (has_raw or has_hgvs) is_phenotype_sheet = PHENOTYPE_KEY_COLUMNS.issubset(columns) if is_genotype_sheet == is_phenotype_sheet: @@ -122,9 +137,7 @@ def apply_mapping( if has_raw and has_hgvs: self._check_hgvs_consistency(sheet_name, df, notepad) # ambiguous sheet should give a warning instead of an error - notepad.add_warning( - f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype" - ) + notepad.add_warning(f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype") continue # rename the former-index column @@ -150,7 +163,13 @@ def apply_mapping( biosample_records.extend(self._map_biosample(sheet_name, working, notepad)) continue - return genotype_records, phenotype_records, disease_records, measurement_records, biosample_records + return ( + genotype_records, + phenotype_records, + disease_records, + measurement_records, + biosample_records, + ) def _check_hgvs_consistency( self, sheet_name: str, df: pd.DataFrame, notepad: Notepad From b42ee163c2a464d8aea670d041bd023ffd0cb333 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 14:03:58 +0200 Subject: [PATCH 16/36] Adjust indentation of the for loop --- src/P6/mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index c6d3d22..f315118 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -123,7 +123,7 @@ def apply_mapping( biosample_records: list[BiosampleRecord] = [] - for sheet_name, df in tables.items(): + for sheet_name, df in tables.items(): columns = set(df.columns) """ 1) classify: does this look like genotype, phenotype, or something to skip? """ has_raw = RAW_VARIANT_COLUMNS.issubset(columns) From 94d34247335d743201c33ab848489432e9c2b5a1 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 14:05:50 +0200 Subject: [PATCH 17/36] Adjust indentation --- src/P6/mapper.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index f315118..b6b42e7 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -111,19 +111,19 @@ def apply_mapping( list[MeasurementRecord], list[BiosampleRecord], ]: - """ - 1) classify each sheet as genotype / phenotype / disease / measurement / biosample - 2) call the matching mapper - """ - # initialize the lists to return - genotype_records: list[Genotype] = [] - phenotype_records: list[Phenotype] = [] - disease_records: list[DiseaseRecord] = [] - measurement_records: list[MeasurementRecord] = [] - biosample_records: list[BiosampleRecord] = [] - - - for sheet_name, df in tables.items(): + """ + 1) classify each sheet as genotype / phenotype / disease / measurement / biosample + 2) call the matching mapper + """ + # initialize the lists to return + genotype_records: list[Genotype] = [] + phenotype_records: list[Phenotype] = [] + disease_records: list[DiseaseRecord] = [] + measurement_records: list[MeasurementRecord] = [] + biosample_records: list[BiosampleRecord] = [] + + + for sheet_name, df in tables.items(): columns = set(df.columns) """ 1) classify: does this look like genotype, phenotype, or something to skip? """ has_raw = RAW_VARIANT_COLUMNS.issubset(columns) From 3834c4342ed3f0a0d68170753996468a32c31fd2 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 14:10:08 +0200 Subject: [PATCH 18/36] Adjust fixture since the CLI is expecting a PATH as a string --- tests/conftest.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3262d60..bf0fd40 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,10 @@ +import gzip +import hpotk import os import pytest +import shutil -import hpotk +from pathlib import Path @pytest.fixture(scope="session") @@ -20,3 +23,14 @@ def fpath_hpo(fpath_test_dir: str) -> str: @pytest.fixture(scope="session") def hpo(fpath_hpo: str) -> hpotk.MinimalOntology: return hpotk.load_minimal_ontology(fpath_hpo) + +# use the `hpo` already defined above in this file +@pytest.fixture +def decompressed_hpo(fpath_hpo: str, tmp_path: Path) -> str: + """ + Decompress the gzipped HPO JSON to a plain JSON file and return its path. + """ + out = tmp_path / "hp.json" + with gzip.open(fpath_hpo, "rb") as f_in, open(out, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return str(out) \ No newline at end of file From 86028e07904b58719dfdff0f52c2d724aa15487a Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 14:12:39 +0200 Subject: [PATCH 19/36] Add TODO --- src/P6/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 78b88f1..94abb88 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -202,6 +202,7 @@ def parse_excel( measurement_records, biosample_records ) = mapper.apply_mapping(tables, notepad) + # TODO: Come back and add more top-level fields # 5) Report any errors or warnings _report_issues(notepad) From 86761a1ef4258a51adcee85802cbd953609ae220 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 7 Aug 2025 14:21:25 +0200 Subject: [PATCH 20/36] Adjust TODOs and initiate Ruff formatting; --- src/P6/__main__.py | 26 +++++++++++++++--- src/P6/biosample.py | 2 ++ src/P6/disease.py | 2 ++ src/P6/mapper.py | 55 ++++++++++++++++++++++--------------- src/P6/measurement.py | 2 ++ tests/conftest.py | 3 +- tests/test_full_features.py | 17 +++++------- 7 files changed, 70 insertions(+), 37 deletions(-) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 94abb88..774a973 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -200,7 +200,7 @@ def parse_excel( phenotype_records, disease_records, measurement_records, - biosample_records + biosample_records, ) = mapper.apply_mapping(tables, notepad) # TODO: Come back and add more top-level fields @@ -208,7 +208,13 @@ def parse_excel( _report_issues(notepad) # 6) Group results by patient - records_by_patient = _group_records_by_patient(genotype_records, phenotype_records, disease_records, measurement_records, biosample_records) + records_by_patient = _group_records_by_patient( + genotype_records, + phenotype_records, + disease_records, + measurement_records, + biosample_records, + ) # 7) Prepare output directory with timestamp # Will contain genotype and phenotype records as JSON @@ -262,10 +268,22 @@ def _report_issues(notepad): def _group_records_by_patient( - genotype_records: list, phenotype_records: list, disease_records: list, measurement_records: list, biosample_records: list + genotype_records: list, + phenotype_records: list, + disease_records: list, + measurement_records: list, + biosample_records: list, ) -> dict[str, dict[str, list]]: # Group genotype & phenotype records by patient ID - records = defaultdict(lambda: {"genotype_records": [], "phenotype_records": [], "disease_records": [], "measurement_records": [], "biosample_records": []}) + records = defaultdict( + lambda: { + "genotype_records": [], + "phenotype_records": [], + "disease_records": [], + "measurement_records": [], + "biosample_records": [], + } + ) for genotype in genotype_records: records[genotype.genotype_patient_ID]["genotype_records"].append(genotype) for phenotype in phenotype_records: diff --git a/src/P6/biosample.py b/src/P6/biosample.py index 0f58cee..13dbbf6 100644 --- a/src/P6/biosample.py +++ b/src/P6/biosample.py @@ -6,6 +6,7 @@ from dataclasses import dataclass + @dataclass class BiosampleRecord: """ @@ -17,6 +18,7 @@ class BiosampleRecord: biosample_type: CURIE of the tissue or sample type (e.g. 'UBERON:0002107'). collection_date: Date string in 'YYYY-MM-DD' format. """ + patient_ID: str biosample_id: str biosample_type: str diff --git a/src/P6/disease.py b/src/P6/disease.py index 2993d40..d044623 100644 --- a/src/P6/disease.py +++ b/src/P6/disease.py @@ -6,6 +6,7 @@ from dataclasses import dataclass + @dataclass class DiseaseRecord: """ @@ -18,6 +19,7 @@ class DiseaseRecord: disease_onset: Date string in 'YYYY-MM-DD' format. disease_status: True if the disease is present, False if excluded. """ + patient_ID: str disease_term: str disease_label: str diff --git a/src/P6/mapper.py b/src/P6/mapper.py index b6b42e7..ebfe51a 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -51,9 +51,9 @@ PHENOTYPE_KEY_COLUMNS = {"hpo_id", "date_of_observation", "status"} # Key columns to identify additional sheets -DISEASE_KEY_COLUMNS = {"disease_term", "disease_onset"} +DISEASE_KEY_COLUMNS = {"disease_term", "disease_onset"} MEASUREMENT_KEY_COLUMNS = {"measurement_type", "measurement_value", "measurement_unit"} -BIOSAMPLE_KEY_COLUMNS = {"biosample_id", "biosample_type", "collection_date"} +BIOSAMPLE_KEY_COLUMNS = {"biosample_id", "biosample_type", "collection_date"} # Map raw zygosity abbreviations to allowed dataclass zygosity values @@ -110,18 +110,17 @@ def apply_mapping( list[DiseaseRecord], list[MeasurementRecord], list[BiosampleRecord], - ]: + ]: """ 1) classify each sheet as genotype / phenotype / disease / measurement / biosample 2) call the matching mapper """ # initialize the lists to return - genotype_records: list[Genotype] = [] - phenotype_records: list[Phenotype] = [] - disease_records: list[DiseaseRecord] = [] - measurement_records: list[MeasurementRecord] = [] - biosample_records: list[BiosampleRecord] = [] - + genotype_records: list[Genotype] = [] + phenotype_records: list[Phenotype] = [] + disease_records: list[DiseaseRecord] = [] + measurement_records: list[MeasurementRecord] = [] + biosample_records: list[BiosampleRecord] = [] for sheet_name, df in tables.items(): columns = set(df.columns) @@ -129,7 +128,9 @@ def apply_mapping( has_raw = RAW_VARIANT_COLUMNS.issubset(columns) has_hgvs = bool(HGVS_VARIANT_COLUMNS & columns) """Send each sheet to the right extractor and collect all records.""" - is_genotype_sheet = GENOTYPE_BASE_COLUMNS.issubset(columns) and (has_raw or has_hgvs) + is_genotype_sheet = GENOTYPE_BASE_COLUMNS.issubset(columns) and ( + has_raw or has_hgvs + ) is_phenotype_sheet = PHENOTYPE_KEY_COLUMNS.issubset(columns) if is_genotype_sheet == is_phenotype_sheet: @@ -137,7 +138,9 @@ def apply_mapping( if has_raw and has_hgvs: self._check_hgvs_consistency(sheet_name, df, notepad) # ambiguous sheet should give a warning instead of an error - notepad.add_warning(f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype") + notepad.add_warning( + f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype" + ) continue # rename the former-index column @@ -157,10 +160,14 @@ def apply_mapping( disease_records.extend(self._map_disease(sheet_name, working, notepad)) continue if MEASUREMENT_KEY_COLUMNS.issubset(columns): - measurement_records.extend(self._map_measurement(sheet_name, working, notepad)) + measurement_records.extend( + self._map_measurement(sheet_name, working, notepad) + ) continue if BIOSAMPLE_KEY_COLUMNS.issubset(columns): - biosample_records.extend(self._map_biosample(sheet_name, working, notepad)) + biosample_records.extend( + self._map_biosample(sheet_name, working, notepad) + ) continue return ( @@ -376,26 +383,30 @@ def _map_phenotype( return records - def _map_disease(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[DiseaseRecord]: + def _map_disease( + self, sheet_name: str, df: pd.DataFrame, notepad: Notepad + ) -> list[DiseaseRecord]: + # TODO: implement row→DiseaseRecord, row→MeasurementRecord conversion, and row→BiosampleRecord conversions """ Map each row in a disease sheet to a DiseaseRecord. """ - records: list[DiseaseRecord] = [] - # TODO: implement row→DiseaseRecord conversion + # TODO: fix as this is not in use now: records: list[DiseaseRecord] = [] raise NotImplementedError - def _map_measurement(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[MeasurementRecord]: + def _map_measurement( + self, sheet_name: str, df: pd.DataFrame, notepad: Notepad + ) -> list[MeasurementRecord]: """ Map each row in a measurement sheet to a MeasurementRecord. """ - records: list[MeasurementRecord] = [] - # TODO: implement row→MeasurementRecord conversion + # TODO: fix as this is not in use now: records: list[MeasurementRecord] = [] raise NotImplementedError - def _map_biosample(self, sheet_name: str, df: pd.DataFrame, notepad: Notepad) -> list[BiosampleRecord]: + def _map_biosample( + self, sheet_name: str, df: pd.DataFrame, notepad: Notepad + ) -> list[BiosampleRecord]: """ Map each row in a biosample sheet to a BiosampleRecord. """ - records: list[BiosampleRecord] = [] - # TODO: implement row→BiosampleRecord conversion + # TODO: fix as this is not in use now: records: list[BiosampleRecord] = [] raise NotImplementedError diff --git a/src/P6/measurement.py b/src/P6/measurement.py index 340c331..18dfeaf 100644 --- a/src/P6/measurement.py +++ b/src/P6/measurement.py @@ -6,6 +6,7 @@ from dataclasses import dataclass + @dataclass class MeasurementRecord: """ @@ -18,6 +19,7 @@ class MeasurementRecord: measurement_unit: Unit CURIE or string (e.g. 'mmol/L'). measurement_timestamp: ISO timestamp string (e.g. '2025-07-15T14:23:00'). """ + patient_ID: str measurement_type: str measurement_value: float diff --git a/tests/conftest.py b/tests/conftest.py index bf0fd40..85d1f25 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,6 +24,7 @@ def fpath_hpo(fpath_test_dir: str) -> str: def hpo(fpath_hpo: str) -> hpotk.MinimalOntology: return hpotk.load_minimal_ontology(fpath_hpo) + # use the `hpo` already defined above in this file @pytest.fixture def decompressed_hpo(fpath_hpo: str, tmp_path: Path) -> str: @@ -33,4 +34,4 @@ def decompressed_hpo(fpath_hpo: str, tmp_path: Path) -> str: out = tmp_path / "hp.json" with gzip.open(fpath_hpo, "rb") as f_in, open(out, "wb") as f_out: shutil.copyfileobj(f_in, f_out) - return str(out) \ No newline at end of file + return str(out) diff --git a/tests/test_full_features.py b/tests/test_full_features.py index e518e5c..964cc06 100644 --- a/tests/test_full_features.py +++ b/tests/test_full_features.py @@ -2,22 +2,19 @@ import re import glob import json -import pytest from click.testing import CliRunner from P6.__main__ import main # Path to the “full” example workbook (e.g. created under tests/data/full_example.xlsx) -FULL_XLSX = os.path.join(os.path.dirname(__file__), "data", "Sydney_Python_transformation.xlsx") +FULL_XLSX = os.path.join( + os.path.dirname(__file__), "data", "Sydney_Python_transformation.xlsx" +) + def test_full_features_parse_creates_all_blocks(decompressed_hpo): runner = CliRunner() result = runner.invoke( - main, - [ - "parse-excel", - "-e", FULL_XLSX, - "--custom-hpo", decompressed_hpo - ], + main, ["parse-excel", "-e", FULL_XLSX, "--custom-hpo", decompressed_hpo] ) assert result.exit_code == 0, result.output @@ -32,6 +29,6 @@ def test_full_features_parse_creates_all_blocks(decompressed_hpo): # Load the first phenopacket and check new keys pkt = json.load(open(json_files[0], encoding="utf-8")) - assert isinstance(pkt.get("diseases", []), list), "Missing 'diseases' block" + assert isinstance(pkt.get("diseases", []), list), "Missing 'diseases' block" assert isinstance(pkt.get("measurements", []), list), "Missing 'measurements' block" - assert isinstance(pkt.get("biosamples", []), list), "Missing 'biosamples' block" + assert isinstance(pkt.get("biosamples", []), list), "Missing 'biosamples' block" From 33695a0116104bc5c54a3fae4620d8df2d83f140 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Fri, 8 Aug 2025 12:02:27 +0200 Subject: [PATCH 21/36] Remove manual decompression of HPO JSON files - leave that for hpotk --- tests/conftest.py | 24 ++++++++---------------- tests/test_full_features.py | 4 ++-- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 85d1f25..ce620fc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,28 +10,20 @@ @pytest.fixture(scope="session") def fpath_test_dir() -> str: """ - Path to `tests` folder. + Path to `tests/data/` folder. """ - return os.path.dirname(os.path.abspath(__file__)) + return os.path.join(os.path.dirname(__file__), "data") @pytest.fixture(scope="session") def fpath_hpo(fpath_test_dir: str) -> str: - return os.path.join(fpath_test_dir, "data", "hp.v2024-04-26.json.gz") + return os.path.join(fpath_test_dir, "hp.v2024-04-26.json.gz") @pytest.fixture(scope="session") def hpo(fpath_hpo: str) -> hpotk.MinimalOntology: - return hpotk.load_minimal_ontology(fpath_hpo) - - -# use the `hpo` already defined above in this file -@pytest.fixture -def decompressed_hpo(fpath_hpo: str, tmp_path: Path) -> str: - """ - Decompress the gzipped HPO JSON to a plain JSON file and return its path. - """ - out = tmp_path / "hp.json" - with gzip.open(fpath_hpo, "rb") as f_in, open(out, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - return str(out) + ''' + The PATH to a JSON file of HPO terms and IDs. + `hpotk` should be able to read this directly without manual decompression. + ''' + return hpotk.load_minimal_ontology(fpath_hpo) \ No newline at end of file diff --git a/tests/test_full_features.py b/tests/test_full_features.py index 964cc06..66e386f 100644 --- a/tests/test_full_features.py +++ b/tests/test_full_features.py @@ -11,10 +11,10 @@ ) -def test_full_features_parse_creates_all_blocks(decompressed_hpo): +def test_full_features_parse_creates_all_blocks(fpath_hpo): runner = CliRunner() result = runner.invoke( - main, ["parse-excel", "-e", FULL_XLSX, "--custom-hpo", decompressed_hpo] + main, ["parse-excel", "-e", FULL_XLSX, "--custom-hpo", fpath_hpo] ) assert result.exit_code == 0, result.output From bfeb87961dcc3a46a20f0896e9c606d19727451f Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Fri, 8 Aug 2025 12:45:26 +0200 Subject: [PATCH 22/36] ruff checking --- tests/conftest.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index ce620fc..a2a2c67 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,6 @@ -import gzip import hpotk import os import pytest -import shutil - -from pathlib import Path @pytest.fixture(scope="session") @@ -22,8 +18,8 @@ def fpath_hpo(fpath_test_dir: str) -> str: @pytest.fixture(scope="session") def hpo(fpath_hpo: str) -> hpotk.MinimalOntology: - ''' + """ The PATH to a JSON file of HPO terms and IDs. `hpotk` should be able to read this directly without manual decompression. - ''' - return hpotk.load_minimal_ontology(fpath_hpo) \ No newline at end of file + """ + return hpotk.load_minimal_ontology(fpath_hpo) From c262f18e766610c6a4c46bab409aec978e58a792 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Fri, 8 Aug 2025 13:23:27 +0200 Subject: [PATCH 23/36] Start to refactor to have apply_mapping return list[Phenopacket] (preferred design) --- src/P6/mapper.py | 97 +++++++++++++----------------------------------- 1 file changed, 25 insertions(+), 72 deletions(-) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index ebfe51a..2f5c83e 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -90,7 +90,8 @@ class TableMapper(metaclass=abc.ABCMeta): def apply_mapping( self, tables: dict[str, pd.DataFrame], notepad: Notepad ) -> typing.Sequence[Phenopacket]: - pass + # return fully-assembled Phenopacket messages, not intermediate parts. + raise NotImplementedError class DefaultMapper(TableMapper): @@ -103,80 +104,32 @@ def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False): self.strict_variants = strict_variants def apply_mapping( - self, tables: dict[str, pd.DataFrame], notepad: Notepad - ) -> tuple[ - list[Genotype], - list[Phenotype], - list[DiseaseRecord], - list[MeasurementRecord], - list[BiosampleRecord], - ]: - """ - 1) classify each sheet as genotype / phenotype / disease / measurement / biosample - 2) call the matching mapper + self, tables: dict[str, pd.DataFrame], notepad: Notepad) -> list[Phenopacket]: """ - # initialize the lists to return - genotype_records: list[Genotype] = [] - phenotype_records: list[Phenotype] = [] - disease_records: list[DiseaseRecord] = [] - measurement_records: list[MeasurementRecord] = [] - biosample_records: list[BiosampleRecord] = [] - - for sheet_name, df in tables.items(): - columns = set(df.columns) - """ 1) classify: does this look like genotype, phenotype, or something to skip? """ - has_raw = RAW_VARIANT_COLUMNS.issubset(columns) - has_hgvs = bool(HGVS_VARIANT_COLUMNS & columns) - """Send each sheet to the right extractor and collect all records.""" - is_genotype_sheet = GENOTYPE_BASE_COLUMNS.issubset(columns) and ( - has_raw or has_hgvs - ) - is_phenotype_sheet = PHENOTYPE_KEY_COLUMNS.issubset(columns) - - if is_genotype_sheet == is_phenotype_sheet: - # if we have both raw & HGVS notations, we need to validate that they match - if has_raw and has_hgvs: - self._check_hgvs_consistency(sheet_name, df, notepad) - # ambiguous sheet should give a warning instead of an error - notepad.add_warning( - f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype" - ) - continue - - # rename the former-index column - working = self._prepare_sheet(df, is_genotype_sheet) - - if is_genotype_sheet: - genotype_records.extend( - self._map_genotype(sheet_name, working, notepad) - ) - else: - phenotype_records.extend( - self._map_phenotype(sheet_name, working, notepad) - ) + Process: + 1) choose/validate input tables + 2) map rows to domain records + 3) group records per patient + 4) construct Phenopacket per patient + 5) return the list of packets - # New Fields - if DISEASE_KEY_COLUMNS.issubset(columns): - disease_records.extend(self._map_disease(sheet_name, working, notepad)) - continue - if MEASUREMENT_KEY_COLUMNS.issubset(columns): - measurement_records.extend( - self._map_measurement(sheet_name, working, notepad) - ) - continue - if BIOSAMPLE_KEY_COLUMNS.issubset(columns): - biosample_records.extend( - self._map_biosample(sheet_name, working, notepad) - ) - continue + """ + # TODO: implement the placeholders I am going to temporarily call + typed_tables = self._choose_named_tables(tables, notepad) + genotype_records = self._map_genotype_table(typed_tables.genotype, notepad) + phenotype_records = self._map_phenotype_table(typed_tables.phenotype, notepad) + disease_records = self._map_diseases_table(typed_tables.diseases, notepad) + measurement_records = self._map_measurements_table(typed_tables.measurements, notepad) + biosample_records = self._map_biosamples_table(typed_tables.biosamples, notepad) + + grouped = self._group_records_by_patient(genotype_records, phenotype_records, disease_records, measurement_records, biosample_records) + + packets: list[Phenopacket] = [ + self.construct_phenopacket_for_patient(patient_id, bundle, notepad) + for patient_id, bundle in grouped.items() + ] + return packets - return ( - genotype_records, - phenotype_records, - disease_records, - measurement_records, - biosample_records, - ) def _check_hgvs_consistency( self, sheet_name: str, df: pd.DataFrame, notepad: Notepad From b8ff573dbc8820bf5a18277aaa7af22dac874e6c Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Fri, 8 Aug 2025 13:33:51 +0200 Subject: [PATCH 24/36] ruff format --- src/P6/mapper.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index 2f5c83e..8f11b4b 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -104,7 +104,8 @@ def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False): self.strict_variants = strict_variants def apply_mapping( - self, tables: dict[str, pd.DataFrame], notepad: Notepad) -> list[Phenopacket]: + self, tables: dict[str, pd.DataFrame], notepad: Notepad + ) -> list[Phenopacket]: """ Process: 1) choose/validate input tables @@ -119,10 +120,18 @@ def apply_mapping( genotype_records = self._map_genotype_table(typed_tables.genotype, notepad) phenotype_records = self._map_phenotype_table(typed_tables.phenotype, notepad) disease_records = self._map_diseases_table(typed_tables.diseases, notepad) - measurement_records = self._map_measurements_table(typed_tables.measurements, notepad) + measurement_records = self._map_measurements_table( + typed_tables.measurements, notepad + ) biosample_records = self._map_biosamples_table(typed_tables.biosamples, notepad) - grouped = self._group_records_by_patient(genotype_records, phenotype_records, disease_records, measurement_records, biosample_records) + grouped = self._group_records_by_patient( + genotype_records, + phenotype_records, + disease_records, + measurement_records, + biosample_records, + ) packets: list[Phenopacket] = [ self.construct_phenopacket_for_patient(patient_id, bundle, notepad) @@ -130,7 +139,6 @@ def apply_mapping( ] return packets - def _check_hgvs_consistency( self, sheet_name: str, df: pd.DataFrame, notepad: Notepad ) -> None: From 4a764e2e6e3102d1a8f0bef444dcaaf98a6d87d5 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Fri, 8 Aug 2025 13:42:21 +0200 Subject: [PATCH 25/36] Continue to refactor to have apply_mapping return list[Phenopacket] (preferred design) --- src/P6/__main__.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/P6/__main__.py b/src/P6/__main__.py index 774a973..ddd5030 100644 --- a/src/P6/__main__.py +++ b/src/P6/__main__.py @@ -195,13 +195,15 @@ def parse_excel( # 4) Apply mapping to get raw records and collect issues notepad = create_notepad("phenopackets") - ( - genotype_records, - phenotype_records, - disease_records, - measurement_records, - biosample_records, - ) = mapper.apply_mapping(tables, notepad) + phenopackets = mapper.apply_mapping(tables, notepad) + output_dir = _prepare_output_dir() + count = 0 + for pkt in phenopackets: + output_path = output_dir / f"{count + 1}.json" + with open(output_path, "w", encoding="utf-8") as out_f: + out_f.write(MessageToJson(pkt)) + count += 1 + click.echo(f"Wrote {count} phenopacket files to {output_dir}") # TODO: Come back and add more top-level fields # 5) Report any errors or warnings From 95e861f014d1e784dbfa25da732f75a31fbaf888 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Fri, 8 Aug 2025 13:46:37 +0200 Subject: [PATCH 26/36] Clarify input by standardized sheet names via TypedTables --- src/P6/mapper.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index 8f11b4b..ae7dc51 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -12,6 +12,8 @@ import re import typing +from collections import defaultdict +from dataclasses import dataclass from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket from stairval.notepad import Notepad @@ -84,6 +86,45 @@ # minimal base columns to call something a genotype sheet (we bring the index in later) GENOTYPE_BASE_COLUMNS = {"contact_email", "phasing"} +# Friendly aliases → reduces friction while keeping behavior explicit +KNOWN_SHEET_ALIASES: dict[str, set[str]] = {"genotype": {"genotype", "variants", "variant", "geno"}, "phenotype": {"phenotype", "hpo", "pheno"}, "diseases": {"disease", "diseases"}, "measurements": {"measurement", "measurements", "labs"}, "biosamples": {"biosample", "biosamples", "samples"}} + +@dataclass +class TypedTables: + """ + Explicit, typed access to workbook sheets. + Any field can be `None`, meaning that the sheet not provided. + """ + genotype: pd.DataFrame | None + phenotype: pd.DataFrame | None + diseases: pd.DataFrame | None + measurements: pd.DataFrame | None + biosamples: pd.DataFrame | None + +def _choose_named_tables(self, tables: dict[str, pd.DataFrame], notepad: Notepad) -> TypedTables: + """ + Prefer explicit sheet names (plus common aliases). + """ + def by_alias(kind: str) -> pd.DataFrame | None: + aliases = KNOWN_SHEET_ALIASES[kind] + for sheet_name, df in tables.items(): + if sheet_name.strip().casefold() in aliases: + return df + return None + + selected = TypedTables( + genotype=by_alias("genotype"), + phenotype=by_alias("phenotype"), + diseases=by_alias("diseases"), + measurements=by_alias("measurements"), + biosamples=by_alias("biosamples"), + ) + + # Hard-minimum: at least genotype or phenotype must exist + if selected.genotype is None and selected.phenotype is None: + notepad.add_error("Missing required sheet: either 'genotype' or 'phenotype'.") + + return selected class TableMapper(metaclass=abc.ABCMeta): @abc.abstractmethod From ca759ae1b975b89da3aecbccd277fe4f18a097e9 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Fri, 8 Aug 2025 15:37:27 +0200 Subject: [PATCH 27/36] Attempt to split apply_mapping into small row-by-row helpers --- src/P6/mapper.py | 413 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 329 insertions(+), 84 deletions(-) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index ae7dc51..4fff14d 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -101,31 +101,6 @@ class TypedTables: measurements: pd.DataFrame | None biosamples: pd.DataFrame | None -def _choose_named_tables(self, tables: dict[str, pd.DataFrame], notepad: Notepad) -> TypedTables: - """ - Prefer explicit sheet names (plus common aliases). - """ - def by_alias(kind: str) -> pd.DataFrame | None: - aliases = KNOWN_SHEET_ALIASES[kind] - for sheet_name, df in tables.items(): - if sheet_name.strip().casefold() in aliases: - return df - return None - - selected = TypedTables( - genotype=by_alias("genotype"), - phenotype=by_alias("phenotype"), - diseases=by_alias("diseases"), - measurements=by_alias("measurements"), - biosamples=by_alias("biosamples"), - ) - - # Hard-minimum: at least genotype or phenotype must exist - if selected.genotype is None and selected.phenotype is None: - notepad.add_error("Missing required sheet: either 'genotype' or 'phenotype'.") - - return selected - class TableMapper(metaclass=abc.ABCMeta): @abc.abstractmethod def apply_mapping( @@ -134,7 +109,6 @@ def apply_mapping( # return fully-assembled Phenopacket messages, not intermediate parts. raise NotImplementedError - class DefaultMapper(TableMapper): def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False): """ @@ -157,13 +131,13 @@ def apply_mapping( """ # TODO: implement the placeholders I am going to temporarily call + # Map each selected sheet to domain-specific records via the table-level wrappers + # The wrappers handle index→patient id normalization and any sheet-level checks then delegate to the row mappers. typed_tables = self._choose_named_tables(tables, notepad) genotype_records = self._map_genotype_table(typed_tables.genotype, notepad) phenotype_records = self._map_phenotype_table(typed_tables.phenotype, notepad) disease_records = self._map_diseases_table(typed_tables.diseases, notepad) - measurement_records = self._map_measurements_table( - typed_tables.measurements, notepad - ) + measurement_records = self._map_measurements_table(typed_tables.measurements, notepad) biosample_records = self._map_biosamples_table(typed_tables.biosamples, notepad) grouped = self._group_records_by_patient( @@ -180,51 +154,6 @@ def apply_mapping( ] return packets - def _check_hgvs_consistency( - self, sheet_name: str, df: pd.DataFrame, notepad: Notepad - ) -> None: - """ - If both raw coordinates and HGVS notation are present, ensure that the genotype notations match - """ - - pattern = re.compile( - r"^(?:chr)?(?P[^:]+):g\.(?P\d+)" - r"(?P[ACGT]+)>(?P[ACGT]+)$", - re.IGNORECASE, - ) - - for idx, row in df.iterrows(): - hgvs = str(row.get("hgvsg", "")).strip() - m = pattern.match(hgvs) - if not m: - # always treat malformed HGVS as an error - notepad.add_error( - f"Sheet {sheet_name!r}, row {idx}: malformed HGVS g. notation {hgvs!r}" - ) - continue - chromosome_name = m.group("chromosome_name") - mutation_position = int(m.group("mutation_position")) - reference_allele = m.group("reference_allele") - alternative_allele = m.group("alternative_allele") - - # compare against raw columns - mismatch_msg = ( - f"Sheet {sheet_name!r}, row {idx}: HGVS '{hgvs}' disagrees with " - f"raw ({row['chromosome']}:{row['start_position']}-" - f"{row['end_position']} {row['reference']}>{row['alternate']})" - ) - if ( - str(row["chromosome"]) != chromosome_name - or int(row["start_position"]) != mutation_position - or int(row["end_position"]) != mutation_position - or str(row["reference"]) != reference_allele - or str(row["alternate"]) != alternative_allele - ): - if self.strict_variants: - notepad.add_error(mismatch_msg) - else: - notepad.add_warning(mismatch_msg) - def _prepare_sheet(self, df: pd.DataFrame, is_genotype: bool) -> pd.DataFrame: """Bring the index into a column and name it appropriately.""" column_id = "genotype_patient_ID" if is_genotype else "phenotype_patient_ID" @@ -385,30 +314,346 @@ def _map_phenotype( return records - def _map_disease( + def _check_hgvs_consistency( self, sheet_name: str, df: pd.DataFrame, notepad: Notepad + ) -> None: + """ + If both raw coordinates and HGVS notation are present, ensure that the genotype notations match + """ + + pattern = re.compile( + r"^(?:chr)?(?P[^:]+):g\.(?P\d+)" + r"(?P[ACGT]+)>(?P[ACGT]+)$", + re.IGNORECASE, + ) + + for idx, row in df.iterrows(): + hgvs = str(row.get("hgvsg", "")).strip() + m = pattern.match(hgvs) + if not m: + # always treat malformed HGVS as an error + notepad.add_error( + f"Sheet {sheet_name!r}, row {idx}: malformed HGVS g. notation {hgvs!r}" + ) + continue + chromosome_name = m.group("chromosome_name") + mutation_position = int(m.group("mutation_position")) + reference_allele = m.group("reference_allele") + alternative_allele = m.group("alternative_allele") + + # compare against raw columns + mismatch_msg = ( + f"Sheet {sheet_name!r}, row {idx}: HGVS '{hgvs}' disagrees with " + f"raw ({row['chromosome']}:{row['start_position']}-" + f"{row['end_position']} {row['reference']}>{row['alternate']})" + ) + if ( + str(row["chromosome"]) != chromosome_name + or int(row["start_position"]) != mutation_position + or int(row["end_position"]) != mutation_position + or str(row["reference"]) != reference_allele + or str(row["alternate"]) != alternative_allele + ): + if self.strict_variants: + notepad.add_error(mismatch_msg) + else: + notepad.add_warning(mismatch_msg) + + def _prepare_sheet_for_patient(self, df: pd.DataFrame, patient_id_column: str) -> pd.DataFrame: + """ + Similar to _prepare_sheet, but used for sheets whose patient identifier column is named 'patient_ID' (diseases, measurements, biosamples). + This brings the current index into a named column so downstream mappers can access a consistent patient identifier. + """ + working = df.reset_index() + original = working.columns[0] + return working.rename(columns={original: patient_id_column}) + + def _choose_named_tables(self, tables: dict[str, pd.DataFrame], notepad: Notepad) -> TypedTables: + """ + Prefer explicit sheet names (plus common aliases). + """ + + def by_alias(kind: str) -> pd.DataFrame | None: + aliases = KNOWN_SHEET_ALIASES[kind] + for sheet_name, df in tables.items(): + if sheet_name.strip().casefold() in aliases: + return df + return None + + selected = TypedTables( + genotype=by_alias("genotype"), + phenotype=by_alias("phenotype"), + diseases=by_alias("diseases"), + measurements=by_alias("measurements"), + biosamples=by_alias("biosamples"), + ) + + # Hard-minimum: at least genotype or phenotype must exist + if selected.genotype is None and selected.phenotype is None: + notepad.add_error("Missing required sheet: either 'genotype' or 'phenotype'.") + + return selected + + # Table-level wrapper mappers + def _map_genotype_table(self, df: pd.DataFrame | None, notepad: Notepad) -> list[Genotype]: + """ + Sheet-level wrapper for Genotype rows: + - normalize index to 'genotype_patient_ID' + - optionally check HGVS vs raw coordinate consistency when both are present + - delegate row conversion to _map_genotype + """ + if df is None: + return [] + working = self._prepare_sheet(df, is_genotype=True) + columns_present = set(working.columns) + if RAW_VARIANT_COLUMNS.issubset(columns_present) and (HGVS_VARIANT_COLUMNS & columns_present): + self._check_hgvs_consistency("genotype", working, notepad) + return self._map_genotype("genotype", working, notepad) + + def _map_phenotype_table(self, df: pd.DataFrame | None, notepad: Notepad) -> list[Phenotype]: + """ + Sheet-level wrapper for Phenotype rows: + - normalize index to 'phenotype_patient_ID' + - delegate row conversion to _map_phenotype + """ + if df is None: + return [] + working = self._prepare_sheet(df, is_genotype=False) + return self._map_phenotype("phenotype", working, notepad) + + def _map_diseases_table(self, df: pd.DataFrame | None, notepad: Notepad) -> list[DiseaseRecord]: + """ + Sheet-level wrapper for Disease rows: + - normalize index to 'patient_ID' + - delegate row conversion to _map_disease + """ + if df is None: + return [] + working = self._prepare_sheet_for_patient(df, "patient_ID") + return self._map_disease("diseases", working, notepad) + + def _map_measurements_table(self, df: pd.DataFrame | None, notepad: Notepad) -> list[MeasurementRecord]: + """ + Sheet-level wrapper for Measurement rows: + - normalize index to 'patient_ID' + - delegate row conversion to _map_measurement + """ + if df is None: + return [] + working = self._prepare_sheet_for_patient(df, "patient_ID") + return self._map_measurement("measurements", working, notepad) + + def _map_biosamples_table(self, df: pd.DataFrame | None, notepad: Notepad) -> list[BiosampleRecord]: + """ + Sheet-level wrapper for Biosample rows: + - normalize index to 'patient_ID' + - delegate row conversion to _map_biosample + """ + if df is None: + return [] + working = self._prepare_sheet_for_patient(df, "patient_ID") + return self._map_biosample("biosamples", working, notepad) + + def _map_disease( + self, sheet_name: str, df: pd.DataFrame, notepad: Notepad ) -> list[DiseaseRecord]: - # TODO: implement row→DiseaseRecord, row→MeasurementRecord conversion, and row→BiosampleRecord conversions """ Map each row in a disease sheet to a DiseaseRecord. + Required columns: patient_ID, disease_term, disease_onset, disease_status. + Optional column: disease_label. """ - # TODO: fix as this is not in use now: records: list[DiseaseRecord] = [] - raise NotImplementedError + records: list[DiseaseRecord] = [] + required_columns = {"patient_ID", "disease_term", "disease_onset", "disease_status"} + missing = required_columns - set(df.columns) + if missing: + notepad.add_error(f"Sheet {sheet_name!r}: missing required columns: {sorted(missing)}") + return records + + for index, row in df.iterrows(): + try: + disease_record = DiseaseRecord( + patient_ID=str(row["patient_ID"]), + disease_term=str(row["disease_term"]).strip(), + disease_label=(str(row.get("disease_label", "")).strip() or None), + disease_onset=str(row["disease_onset"]).strip(), + disease_status=bool(row["disease_status"]), + ) + records.append(disease_record) + except (ValueError, TypeError) as exception: + notepad.add_error(f"Sheet {sheet_name!r}, row {index}: {exception}") + return records def _map_measurement( - self, sheet_name: str, df: pd.DataFrame, notepad: Notepad + self, sheet_name: str, df: pd.DataFrame, notepad: Notepad ) -> list[MeasurementRecord]: """ Map each row in a measurement sheet to a MeasurementRecord. + Required columns: patient_ID, measurement_type, measurement_value, measurement_unit. + Optional column: measurement_timestamp (numeric values are prefixed with 'T' for consistency). """ - # TODO: fix as this is not in use now: records: list[MeasurementRecord] = [] - raise NotImplementedError + records: list[MeasurementRecord] = [] + required_columns = {"patient_ID", "measurement_type", "measurement_value", "measurement_unit"} + missing = required_columns - set(df.columns) + if missing: + notepad.add_error(f"Sheet {sheet_name!r}: missing required columns: {sorted(missing)}") + return records + + for index, row in df.iterrows(): + try: + raw_timestamp = row.get("measurement_timestamp", "") + if isinstance(raw_timestamp, (int, float)) and not str(raw_timestamp).startswith("T"): + measurement_timestamp = f"T{int(raw_timestamp)}" + else: + measurement_timestamp = str(raw_timestamp).strip() if pd.notna(raw_timestamp) else None + + measurement_record = MeasurementRecord( + patient_ID=str(row["patient_ID"]), + measurement_type=str(row["measurement_type"]).strip(), + measurement_value=float(row["measurement_value"]), + measurement_unit=str(row["measurement_unit"]).strip(), + measurement_timestamp=measurement_timestamp, + ) + records.append(measurement_record) + except (ValueError, TypeError) as exception: + notepad.add_error(f"Sheet {sheet_name!r}, row {index}: {exception}") + return records def _map_biosample( - self, sheet_name: str, df: pd.DataFrame, notepad: Notepad + self, sheet_name: str, df: pd.DataFrame, notepad: Notepad ) -> list[BiosampleRecord]: """ Map each row in a biosample sheet to a BiosampleRecord. + Required columns: patient_ID, biosample_id, biosample_type, collection_date. + Numeric collection_date values are prefixed with 'T' for consistency with phenotype dates. """ - # TODO: fix as this is not in use now: records: list[BiosampleRecord] = [] - raise NotImplementedError + records: list[BiosampleRecord] = [] + required_columns = {"patient_ID", "biosample_id", "biosample_type", "collection_date"} + missing = required_columns - set(df.columns) + if missing: + notepad.add_error(f"Sheet {sheet_name!r}: missing required columns: {sorted(missing)}") + return records + + for index, row in df.iterrows(): + try: + raw_collection_date = row["collection_date"] + if isinstance(raw_collection_date, (int, float)) and not str(raw_collection_date).startswith("T"): + collection_date = f"T{int(raw_collection_date)}" + else: + collection_date = str(raw_collection_date).strip() + + biosample_record = BiosampleRecord( + patient_ID=str(row["patient_ID"]), + biosample_id=str(row["biosample_id"]).strip(), + biosample_type=str(row["biosample_type"]).strip(), + collection_date=collection_date, + ) + records.append(biosample_record) + except (ValueError, TypeError) as exception: + notepad.add_error(f"Sheet {sheet_name!r}, row {index}: {exception}") + return records + + # Grouping and phenopacket construction + def _group_records_by_patient(self, genotype_records: list[Genotype], phenotype_records: list[Phenotype], disease_records: list[DiseaseRecord], measurement_records: list[MeasurementRecord], biosample_records: list[BiosampleRecord], ) -> dict[str, dict[str, list]]: + """ + Group all domain records by patient identifier, producing a bundle per patient + """ + grouped = defaultdict( + lambda: { + "genotype_records": [], + "phenotype_records": [], + "disease_records": [], + "measurement_records": [], + "biosample_records": [], + } + ) + for genotype in genotype_records: + grouped[genotype.genotype_patient_ID]["genotype_records"].append(genotype) + for phenotype in phenotype_records: + grouped[phenotype.phenotype_patient_ID]["phenotype_records"].append(phenotype) + for disease in disease_records: + grouped[disease.patient_ID]["disease_records"].append(disease) + for measurement in measurement_records: + grouped[measurement.patient_ID]["measurement_records"].append(measurement) + for biosample in biosample_records: + grouped[biosample.patient_ID]["biosample_records"].append(biosample) + return grouped + + def construct_phenopacket_for_patient(self, patient_id: str, bundle: dict[str, list], notepad: Notepad) -> Phenopacket: + """ + Build a Phenopacket for a single patient using their grouped records. + Field assignments follow the explicit naming and serialization style. + """ + from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket + import phenopackets.schema.v2 as pps2 + + phenopacket = Phenopacket() + phenopacket.id = patient_id + phenopacket.subject.id = patient_id + + # Phenotypic features + for phenotype in bundle.get("phenotype_records", []): + feature = phenopacket.phenotypic_features.add() + feature.type.id = phenotype.HPO_ID + if not phenotype.status: + feature.excluded = True + + # Genotype interpretations (minimal HGVS expression to start) + for interpretation_index, genotype_record in enumerate(bundle.get("genotype_records", [])): + interpretation = phenopacket.interpretations.add() + interpretation.id = f"{patient_id}-interpretation-{interpretation_index}" + interpretation.progress_status = interpretation.ProgressStatus.COMPLETED + + genomic_interpretation_entry = interpretation.diagnosis.genomic_interpretations.add() + genomic_interpretation_entry.subject_or_biosample_id = patient_id + genomic_interpretation_entry.interpretation_status = ( + genomic_interpretation_entry.InterpretationStatus.CONTRIBUTORY + ) + + variant_interpretation = genomic_interpretation_entry.variant_interpretation + variation_descriptor = variant_interpretation.variation_descriptor + + # Add HGVS expression; set syntax enum when available + expression = variation_descriptor.expressions.add() + try: + expression.syntax = pps2.VariationDescriptor.Expression.HGVS + except AttributeError: + pass + expression.value = genotype_record.hgvsg or "" + + # Optional: attempt to set a subset of location/alleles if supported + try: + location_context = variation_descriptor.location + location_context.interval.interval_type = ( + pps2.VariationDescriptor.Location.Interval.Type.EXACT + ) + location_context.interval.start = genotype_record.start_position + location_context.interval.end = genotype_record.end_position + location_context.reference_sequence_id = genotype_record.chromosome + variation_descriptor.reference = genotype_record.reference + variation_descriptor.alternate = genotype_record.alternate + except AttributeError: + # Some library builds do not expose these submessages; try to skip gracefully if we cannot implement this feature. + pass + + # Optional sections (diseases, measurements, biosamples). + # Keep assignments minimal and consistent with the earlier CLI code. + for disease_record in bundle.get("disease_records", []): + disease_message = phenopacket.diseases.add() + disease_message.term.id = disease_record.disease_term + if getattr(disease_record, "disease_label", None): + disease_message.term.label = disease_record.disease_label + # Onset/status message wiring can be expanded later as needed. + + for measurement_record in bundle.get("measurement_records", []): + measurement_message = phenopacket.measurements.add() + measurement_message.type.id = measurement_record.measurement_type + # Depending on the installed protobuf version, value/unit/timestamp may require message types; + # keep this minimal for now (extend down the line). + + for biosample_record in bundle.get("biosample_records", []): + biosample_message = phenopacket.biosamples.add() + biosample_message.id = biosample_record.biosample_id + biosample_message.type.id = biosample_record.biosample_type + + return phenopacket + From e37930fcd2025757871b39b804b44610129f8659 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Fri, 8 Aug 2025 18:05:03 +0200 Subject: [PATCH 28/36] Split into a row-level helper and call it in the table wrapper --- src/P6/mapper.py | 62 +++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index 4fff14d..421fd3c 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -16,6 +16,7 @@ from dataclasses import dataclass from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket from stairval.notepad import Notepad +from typing import Sequence, Dict, List from .biosample import BiosampleRecord from .disease import DiseaseRecord @@ -314,50 +315,30 @@ def _map_phenotype( return records - def _check_hgvs_consistency( - self, sheet_name: str, df: pd.DataFrame, notepad: Notepad - ) -> None: - """ - If both raw coordinates and HGVS notation are present, ensure that the genotype notations match - """ - + def check_hgvs_consistency(item: pd.Series, sheet_name: str, notepad: Notepad, strict: bool) -> None: pattern = re.compile( r"^(?:chr)?(?P[^:]+):g\.(?P\d+)" r"(?P[ACGT]+)>(?P[ACGT]+)$", re.IGNORECASE, ) - - for idx, row in df.iterrows(): - hgvs = str(row.get("hgvsg", "")).strip() - m = pattern.match(hgvs) - if not m: - # always treat malformed HGVS as an error - notepad.add_error( - f"Sheet {sheet_name!r}, row {idx}: malformed HGVS g. notation {hgvs!r}" - ) - continue - chromosome_name = m.group("chromosome_name") - mutation_position = int(m.group("mutation_position")) - reference_allele = m.group("reference_allele") - alternative_allele = m.group("alternative_allele") - - # compare against raw columns - mismatch_msg = ( - f"Sheet {sheet_name!r}, row {idx}: HGVS '{hgvs}' disagrees with " - f"raw ({row['chromosome']}:{row['start_position']}-" - f"{row['end_position']} {row['reference']}>{row['alternate']})" - ) - if ( - str(row["chromosome"]) != chromosome_name - or int(row["start_position"]) != mutation_position - or int(row["end_position"]) != mutation_position - or str(row["reference"]) != reference_allele - or str(row["alternate"]) != alternative_allele - ): - if self.strict_variants: - notepad.add_error(mismatch_msg) - else: - notepad.add_warning(mismatch_msg) + hgvs = str(item.get("hgvsg", "")).strip() + m = pattern.match(hgvs) + if not m: + notepad.add_error(f"Sheet {sheet_name!r}: malformed HGVS g. notation {hgvs!r}") + return + + mismatch = ( + str(item["chromosome"]) != m.group("chromosome_name") or + int(item["start_position"]) != int(m.group("mutation_position")) or + int(item["end_position"]) != int(m.group("mutation_position")) or + str(item["reference"]) != m.group("reference_allele") or + str(item["alternate"]) != m.group("alternative_allele") + ) + if mismatch: + msg = (f"Sheet {sheet_name!r}: HGVS '{hgvs}' disagrees with " + f"raw ({item['chromosome']}:{item['start_position']}-" + f"{item['end_position']} {item['reference']}>{item['alternate']})") + (notepad.add_error if strict else notepad.add_warning)(msg) def _prepare_sheet_for_patient(self, df: pd.DataFrame, patient_id_column: str) -> pd.DataFrame: """ @@ -407,7 +388,8 @@ def _map_genotype_table(self, df: pd.DataFrame | None, notepad: Notepad) -> list working = self._prepare_sheet(df, is_genotype=True) columns_present = set(working.columns) if RAW_VARIANT_COLUMNS.issubset(columns_present) and (HGVS_VARIANT_COLUMNS & columns_present): - self._check_hgvs_consistency("genotype", working, notepad) + for _, row in working.iterrows(): + check_hgvs_consistency(row, "genotype", notepad, self.strict_variants) return self._map_genotype("genotype", working, notepad) def _map_phenotype_table(self, df: pd.DataFrame | None, notepad: Notepad) -> list[Phenotype]: From ef2a70bbfdb058167cf997cac68f850285957188 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 11 Aug 2025 11:08:22 +0200 Subject: [PATCH 29/36] refactor(mapper): attempt to restructure DefaultMapper into modular, testable components and align interface - **Align with `TableMapper` interface** - `apply_mapping` now returns `list[Phenopacket]` (previously returned tuples of record lists) - Pipeline steps: 1. Select and validate tables via `_choose_named_tables` 2. Map rows to domain records using `_map_*_table` wrappers 3. Group records per patient 4. Build one `Phenopacket` per patient 5. Return the list of packets - **Explicit table selection** - Added `KNOWN_SHEET_ALIASES` for common name variants: - `genotype`: {"genotype", "variants", "variant", "geno"} - `phenotype`: {"phenotype", "hpo", "pheno"} - `diseases`: {"disease", "diseases"} - `measurements`: {"measurement", "measurements", "labs"} - `biosamples`: {"biosample", "biosamples", "samples"} - Removed column-based guessing; require explicit/aliased names - Emit error if both genotype and phenotype sheets are missing - **Row-level parsers** - `parse_genotype_row`: - Splits slash-separated zygosity/inheritance - Validates against `ZYGOSITY_MAP` / `INHERITANCE_MAP` - Normalizes email and parses numeric positions - Returns `(genotype_records, [])` - `parse_phenotype_row`: - Extracts HPO ID + optional label - Normalizes `date_of_observation` - Uses `_to_bool` for `status` (fixes tuple bug from `bool(row["status"],)`) - Runs ontology lookups and warns on obsolete/mismatched labels - Returns `(phenotype_records, term_ids)` - **Validation utilities** - `_normalize_time_like`: handles `NaN`, `NaT`, blanks; adds `T` prefix to numeric dates - `_to_bool`: robust bool parsing from `1/0`, `yes/no`, `true/false`, etc. - `check_hgvs_consistency`: compares HGVS string to raw coordinate/allele fields; normalizes case and `chr` prefix; warns/errors based on `strict_variants` - **Sheet-level wrappers** - `_map_genotype_table`: normalizes index, checks required `GENOTYPE_KEY_COLUMNS`, runs HGVS consistency check, delegates to `_map_genotype` - `_map_phenotype_table`: checks `PHENOTYPE_KEY_COLUMNS`, delegates to `_map_phenotype` with batch HPO validators - `_map_diseases_table`, `_map_measurements_table`, `_map_biosamples_table`: normalize index, check required columns, delegate to row mappers - **Phenopacket assembly** - `_group_records_by_patient`: aggregates all record types by patient ID - `construct_phenopacket_for_patient`: - Sets subject id - Adds phenotypic features (sets `excluded` if status is false) - Adds genotype interpretations with HGVS expression and optional location/allele fields - Adds diseases, measurements, and biosamples minimally - **Fixes** - Corrected phenotype `status` parsing to `_to_bool` (prevents always-true bug) - Centralized column existence checks to fail fast on missing required fields - **Non-functional** - Improved docstrings and inline comments for clarity - Left optional raw/HGVS fallback logic commented for possible future re-enablement --- src/P6/mapper.py | 407 ++++++++++++++++++++++++++++------------------- 1 file changed, 246 insertions(+), 161 deletions(-) diff --git a/src/P6/mapper.py b/src/P6/mapper.py index 421fd3c..6f700d9 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -16,7 +16,12 @@ from dataclasses import dataclass from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket from stairval.notepad import Notepad -from typing import Sequence, Dict, List +from typing import Sequence, Dict, List, TypeVar, Tuple + +T = TypeVar("T") +RowParseResult = Tuple[List[T], List[hpotk.TermId]] +# gives us one consistent return shape: (parsed_items, aux_ids_for_batch_validation) + from .biosample import BiosampleRecord from .disease import DiseaseRecord @@ -24,7 +29,6 @@ from .measurement import MeasurementRecord from .phenotype import Phenotype - # For any renamed field, the two neighbors it must sit between EXPECTED_COLUMN_NEIGHBORS = { "start_position": ("chromosome", "end_position"), @@ -58,7 +62,6 @@ MEASUREMENT_KEY_COLUMNS = {"measurement_type", "measurement_value", "measurement_unit"} BIOSAMPLE_KEY_COLUMNS = {"biosample_id", "biosample_type", "collection_date"} - # Map raw zygosity abbreviations to allowed dataclass zygosity values ZYGOSITY_MAP = { "het": "heterozygous", @@ -88,7 +91,12 @@ GENOTYPE_BASE_COLUMNS = {"contact_email", "phasing"} # Friendly aliases → reduces friction while keeping behavior explicit -KNOWN_SHEET_ALIASES: dict[str, set[str]] = {"genotype": {"genotype", "variants", "variant", "geno"}, "phenotype": {"phenotype", "hpo", "pheno"}, "diseases": {"disease", "diseases"}, "measurements": {"measurement", "measurements", "labs"}, "biosamples": {"biosample", "biosamples", "samples"}} +KNOWN_SHEET_ALIASES: dict[str, set[str]] = {"genotype": {"genotype", "variants", "variant", "geno"}, + "phenotype": {"phenotype", "hpo", "pheno"}, + "diseases": {"disease", "diseases"}, + "measurements": {"measurement", "measurements", "labs"}, + "biosamples": {"biosample", "biosamples", "samples"}} + @dataclass class TypedTables: @@ -102,14 +110,16 @@ class TypedTables: measurements: pd.DataFrame | None biosamples: pd.DataFrame | None + class TableMapper(metaclass=abc.ABCMeta): @abc.abstractmethod def apply_mapping( - self, tables: dict[str, pd.DataFrame], notepad: Notepad + self, tables: dict[str, pd.DataFrame], notepad: Notepad ) -> typing.Sequence[Phenopacket]: # return fully-assembled Phenopacket messages, not intermediate parts. raise NotImplementedError + class DefaultMapper(TableMapper): def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False): """ @@ -120,7 +130,7 @@ def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False): self.strict_variants = strict_variants def apply_mapping( - self, tables: dict[str, pd.DataFrame], notepad: Notepad + self, tables: dict[str, pd.DataFrame], notepad: Notepad ) -> list[Phenopacket]: """ Process: @@ -162,142 +172,186 @@ def _prepare_sheet(self, df: pd.DataFrame, is_genotype: bool) -> pd.DataFrame: original = working.columns[0] return working.rename(columns={original: column_id}) - def _map_genotype( - self, sheet_name: str, df: pd.DataFrame, notepad: Notepad - ) -> list[Genotype]: - records: list[Genotype] = [] - for idx, row in df.iterrows(): - # handle slash‑separated zygosity and inheritance - list_of_zygosity_types = [ - z.strip().lower() for z in str(row["zygosity"]).split("/") - ] - list_of_inheritance_types = [ - i.strip().lower() for i in str(row["inheritance"]).split("/") - ] - for zygosity_type, inheritance_type in zip( - list_of_zygosity_types, list_of_inheritance_types - ): - if zygosity_type not in ZYGOSITY_MAP: - notepad.add_error( - f"Sheet {sheet_name!r}: Unrecognized zygosity code {zygosity_type!r}" - ) - if inheritance_type not in INHERITANCE_MAP: - notepad.add_error( - f"Sheet {sheet_name!r}: Unrecognized inheritance code {inheritance_type!r}" + @staticmethod + def _normalize_time_like(value: typing.Any) -> str: + """ + Phenotype/measurement/biosample timestamps: + - numeric values are prefixed with 'T' (e.g., 20200101 -> 'T20200101') + - strings are trimmed; if not already prefixed with 'T', we add it + - empty/NaN -> empty string + """ + # Handle None, NaN, NaT, pandas NA, and empty/whitespace-only strings + if value is None or pd.isna(value) or (isinstance(value, str) and not value.strip()): + return "" + if isinstance(value, (int, float)) and not isinstance(value, bool): + return f"T{int(value)}" + s = str(value).strip() + if not s: + return "" + return s if s.upper().startswith("T") else f"T{s}" + + @staticmethod + def _to_bool(value: typing.Any) -> bool: + """ + Robust boolean parsing: + - True for: 1, '1', 'true', 't', 'yes', 'y' (case-insensitive) + - False for: 0, '0', 'false', 'f', 'no', 'n', '', None + - Fallback: Python truthiness on other values (rare) + """ + if isinstance(value, bool): + return value + if value is None: + return False + s = str(value).strip().lower() + if s in {"1", "true", "t", "yes", "y"}: + return True + if s in {"0", "false", "f", "no", "n", ""}: + return False + return bool(value) + + @staticmethod + def parse_genotype_row(row: pd.Series, sheet_name: str, notepad: Notepad) -> RowParseResult[Genotype]: + """ + Parse a single genotype row into zero or more Genotype dataclass instances. + Returns ([], []) if validation fails for this row. + """ + genotypes: list[Genotype] = [] + + # handle slash-separated zygosity and inheritance + list_of_zygosity_types = [zygosity_entry.strip().lower() for zygosity_entry in + str(row.get("zygosity", "")).split("/")] + list_of_inheritance_types = [inheritance_entry.strip().lower() for inheritance_entry in + str(row.get("inheritance", "")).split("/")] + + # zip will truncate to the shorter of the two, matching the previous behavior + for zygosity_type, inheritance_type in zip(list_of_zygosity_types, list_of_inheritance_types): + if zygosity_type not in ZYGOSITY_MAP: + notepad.add_error(f"Sheet {sheet_name!r}: Unrecognized zygosity code {zygosity_type!r}") + return [], [] # bail on this row + if inheritance_type not in INHERITANCE_MAP: + notepad.add_error(f"Sheet {sheet_name!r}: Unrecognized inheritance code {inheritance_type!r}") + return [], [] # bail on this row + + # allow missing/NaN contact_email → substitute dummy + raw_email = row.get("contact_email") + contact_email = ("unknown@example.com" if pd.isna(raw_email) else str(raw_email).strip()) + + try: + genotypes.append( + Genotype( + genotype_patient_ID=str(row["genotype_patient_ID"]), + contact_email=contact_email, + phasing=DefaultMapper._to_bool(row.get("phasing")), + chromosome=str(row["chromosome"]), + start_position=int(row["start_position"]), + end_position=int(row["end_position"]), + reference=str(row["reference"]), + alternate=str(row["alternate"]), + gene_symbol=str(row["gene_symbol"]), + hgvsg=str(row["hgvsg"]), + hgvsc=str(row["hgvsc"]), + hgvsp=str(row["hgvsp"]), + zygosity=ZYGOSITY_MAP[zygosity_type], + inheritance=INHERITANCE_MAP[inheritance_type] ) - # allow missing/NaN contact_email → substitute dummy - raw_email = row["contact_email"] - contact_email = ( - "unknown@example.com" - if pd.isna(raw_email) - else str(raw_email).strip() ) - kwargs = { - "genotype_patient_ID": str(row["genotype_patient_ID"]), - "contact_email": contact_email, - "phasing": bool(row["phasing"]), - "chromosome": str(row["chromosome"]), - "start_position": int(row["start_position"]), - "end_position": int(row["end_position"]), - "reference": str(row["reference"]), - "alternate": str(row["alternate"]), - "gene_symbol": str(row["gene_symbol"]), - "hgvsg": str(row["hgvsg"]), - "hgvsc": str(row["hgvsc"]), - "hgvsp": str(row["hgvsp"]), - "zygosity": ZYGOSITY_MAP[zygosity_type], - "inheritance": INHERITANCE_MAP[inheritance_type], - } - try: - records.append(Genotype(**kwargs)) - except (ValueError, TypeError) as e: - notepad.add_error(f"Sheet {sheet_name!r}, row {idx}: {e}") - return records + except (ValueError, TypeError) as e: + notepad.add_error(f"Sheet {sheet_name!r}: {e}") + return [], [] # treat any construction error as fatal for this row - def _map_phenotype( - self, sheet_name: str, df: pd.DataFrame, notepad: Notepad - ) -> list[Phenotype]: - records: list[Phenotype] = [] - # Collect every HPO ID in this sheet, so we can validate propagation later: - all_ids: list[hpotk.TermId] = [] + return genotypes, [] # no batch IDs for genotypes (yet) - for idx, row in df.iterrows(): - # normalize phenotype fields into valid strings - hpo_cell = str(row["hpo_id"]).strip() - # Parse optional label and digits - # extract the last token (it should just be the HPO code), case‑insensitive - m = re.match( - r""" - ^\s* - (?P