diff --git a/README.md b/README.md
index 82e5221..19a6639 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # P6
 **Peter's Parse and Processing of Prenatal Particulars via Pandas**
-A simple, extensible CLI for downloading the Human Phenotype Ontology, parsing genotype/phenotype Excel workbooks, and producing [GA4GH Phenopackets](https://phenopacket-schema.readthedocs.io/en/latest/schema.html#version-2-0) as specified [here](https://phenopacket-schema.readthedocs.io/_/downloads/en/stable/pdf/).
+A simple, extensible CLI for downloading the Human Phenotype Ontology, parsing genotype/phenotype Excel workbooks, and producing [GA4GH Phenopackets](https://phenopacket-schema.readthedocs.io/en/latest/schema.html#version-2-0) as specified [here](https://phenopacket-schema.readthedocs.io/_/downloads/en/stable/pdf/). This project enables downloading the latest or a specified Human Phenotype Ontology (HPO) JSON release, auto-classifying Excel sheets as genotype or phenotype data, normalizing column names and HPO IDs, and writing one Phenopacket per record. Additional commands provide quick auditing of workbooks for header normalization, sheet classification, and required variant columns. Built for easy integration and reproducibility, P6 supports rapid phenotypic data preparation for research and clinical workflows, and runs locally with simple installation via pip. The end goal of this project is to convert an existing digital record of phenotypic data into phenopackets so that they can be linked to their corresponding VCFs and integrated into a larger federated repository system.
 ## Table of Contents
@@ -10,10 +10,12 @@ A simple, extensible CLI for downloading the Human Phenotype Ontology, parsing g
 3. [Installation](#installation)
 4.
[Quickstart](#quickstart)
    - [Download HPO JSON](#download-hpo-json)
-   - [Parse Excel to Phenopackets](#parse-excel-to-phenopackets)
+   - [Parse Excel to Phenopackets](#parse-excel-to-phenopackets)
+   - [Audit Excel Workbooks](#audit-excel-workbooks)
 5. [CLI Reference](#cli-reference)
    - [`p6 download`](#p6-download)
    - [`p6 parse-excel`](#p6-parse-excel)
+   - [`p6 audit-excel`](#p6-audit-excel)
 6. [Development & Testing](#development--testing)
 7. [Contributing](#contributing)
 8. [License](#license)
@@ -94,6 +96,18 @@ Resulting phenopacket files will be under:
 phenopacket_from_excel/$(date "+%Y-%m-%d_%H-%M-%S")/phenopackets/
 ```
+### Audit Excel Workbooks
+
+Quickly check each sheet in an Excel file for header normalization, sheet classification, and presence of required variant columns.
+```bash
+p6 audit-excel -e tests/data/Sydney_Python_transformation.xlsx
+```
+
+By default the report is printed as a table; pass `-r` to print JSON to the console instead.
+```bash
+p6 audit-excel -e tests/data/Sydney_Python_transformation.xlsx -r
+```
+
 ## CLI Reference

 ### p6 download
@@ -101,11 +115,13 @@ phenopacket_from_excel/$(date "+%Y-%m-%d_%H-%M-%S")/phenopackets/
 Usage:
 ```markdown
 p6 download [OPTIONS]
+```

 Options:
- -d, --data-path PATH     where to save HPO JSON (default: tests/data)
- -v, --hpo-version TEXT   exact HPO release tag (e.g. 2025-03-03 or v2025-03-03)
- --help                   Show this help message and exit.
+```markdown
+ -d, --data-path PATH     where to save HPO JSON (default: tests/data)
+ -v, --hpo-version TEXT   exact HPO release tag (e.g. 2025-03-03 or v2025-03-03)
+ --help                   Show this help message and exit.
 ```

 Examples:
@@ -130,9 +146,9 @@ Usage: `p6 parse-excel [OPTIONS] EXCEL_FILE`

 Options:
 ```markdown
- -e, --excel-path FILE    path to the Excel workbook [required]
- -hpo, --custom-hpo FILE  path to a custom HPO JSON file (defaults to `tests/data/hp.json`)
- --help                   Show this message and exit.
+ -e, --excel-path FILE    path to the Excel workbook [required]
+ -hpo, --custom-hpo FILE  path to a custom HPO JSON file (defaults to `tests/data/hp.json`)
+ --help                   Show this message and exit.
 ```

 Example:
@@ -142,6 +158,19 @@ Explicitly point at a custom HPO file:
 p6 parse-excel -e tests/data/Sydney_Python_transformation.xlsx -hpo src/P6/hp.json
 ```
+### p6 audit-excel
+
+Run a lightweight audit on each sheet in an Excel workbook, reporting header counts, sheet classification, and any missing variant columns.
+
+Usage: `p6 audit-excel [OPTIONS] EXCEL_FILE`
+
+Options:
+```markdown
+ -e, --excel-path FILE  path to the Excel workbook [required]
+ -r, --report-json      output audit report as JSON instead of table
+ --help                 Show this message and exit.
+```
+
 ## Development & Testing

 Install dev requirements:
diff --git a/src/P6/__main__.py b/src/P6/__main__.py
index 8ac8f4b..1eb71e9 100644
--- a/src/P6/__main__.py
+++ b/src/P6/__main__.py
@@ -6,13 +6,14 @@
 import click
 import hpotk
+import json
 import pandas as pd  # Not needed for Pandas_Workaround, i.e.
don't declare or call "_read_sheets" at all; just use `tables = load_sheets_as_tables(excel_file)`, which only needs `from .loader import load_sheets_as_tables`
 import pathlib
 import requests
 import sys
 import typing

-from collections import defaultdict
+from collections import defaultdict, namedtuple
 from datetime import datetime
 from google.protobuf.json_format import MessageToJson
 from stairval.notepad import create_notepad
@@ -22,6 +23,8 @@
 from .loader import load_sheets_as_tables
 from .mapper import DefaultMapper

+AuditEntry = namedtuple("AuditEntry", ["step", "sheet", "message", "level"])
+

 @click.group()
 def main():
@@ -29,6 +32,52 @@ def main():
     pass

+@main.command(name="audit-excel")
+@click.option(
+    "-e",
+    "--excel-path",
+    "excel_file",
+    required=True,
+    type=click.Path(exists=True, dir_okay=False),
+    help="path to the Excel workbook",
+)
+@click.option(
+    "-r",
+    "--report-json",
+    "report_json",
+    is_flag=True,
+    help="output audit report as JSON instead of table",
+)
+def audit_excel(excel_file: str, report_json: bool):
+    """
+    Run a preprocessing audit on each sheet in the given workbook:
+      - header normalization
+      - sheet classification (genotype/phenotype/skip)
+      - variant-column presence checks
+    """
+    # 1) Read sheets
+    tables = _read_sheets(excel_file)
+
+    # 2) Produce audit entries (preprocess is defined later in this same
+    #    module, so no self-import is needed)
+    entries = preprocess(tables)
+
+    # 3) Render report
+    if report_json:
+        # turn each AuditEntry into a serializable dict
+        payload = [
+            {"step": e.step, "sheet": e.sheet, "level": e.level, "message": e.message}
+            for e in entries
+        ]
+        click.echo(json.dumps(payload, indent=2))
+    else:
+        # table header
+        click.echo(f"{'SHEET':20} {'STEP':25} {'LEVEL':8} MESSAGE")
+        for e in entries:
+            click.echo(f"{e.sheet:20} {e.step:25} {e.level:8} {e.message}")
+
+
 @main.command(name="download")
 @click.option(
     "-d",
@@ -46,7 +95,6 @@ def main():
     help="exact HPO release tag (e.g.
2025-03-03 or v2025-03-03)",
 )
 def download(data_dir: str, hpo_version: typing.Optional[str]):
-    # TODO: download an HPO
     """
     Download a specific or the latest HPO JSON release into the tests/data/ folder.
     """
@@ -94,7 +142,20 @@ def download(data_dir: str, hpo_version: typing.Optional[str]):
     type=click.Path(exists=True, dir_okay=False),
     help="path to a custom HPO JSON file (defaults to tests/data/hp.json)",
 )
-def parse_excel(excel_file: str, hpo_path: typing.Optional[str] = None):
+@click.option(
+    "--strict-variants/--no-strict-variants",
+    default=False,
+    help=("Treat raw↔HGVS mismatches as errors (default: warn)."),
+)
+@click.option(
+    "--verbose", is_flag=True, help="Show preprocessing and classification steps"
+)
+def parse_excel(
+    excel_file: str,
+    hpo_path: typing.Optional[str] = None,
+    verbose: bool = False,
+    strict_variants: bool = False,
+):
     """
     Read each sheet, check column order, then:
     - Identify as a Genotype sheet if ALL GENOTYPE_KEY_COLUMNS are present.
@@ -107,41 +168,67 @@ def parse_excel(excel_file: str, hpo_path: typing.Optional[str] = None):
     # 2) Build ontology and mapper
     ontology = _load_ontology(str(hpo_file))
-    mapper = DefaultMapper(ontology)
+    mapper = DefaultMapper(ontology, strict_variants=strict_variants)

     # 3) Read all sheets into DataFrames
     tables = _read_sheets(excel_file)
     # tables = load_sheets_as_tables(excel_file)  # Just use this for Pandas_Workaround. Don't declare or call "_read_sheets" at all.
Just use `tables = load_sheets_as_tables(excel_file)`, which only needs `from .loader import load_sheets_as_tables`
     # TODO: Decide if it is better to implement `Pandas_Workaround` or just use Pandas

+    # optionally audit preprocessing
+    if verbose:
+        for entry in preprocess(tables):
+            # click.echo(f"[{entry.level.upper():7}] {entry.step:20} {entry.sheet:15} {entry.message}")
+            # indent every line for readability
+            indent = " "
+            line = f"{entry.step:20} {entry.sheet:15} {entry.message}"
+            # color by level
+            click.echo("")  # blank line between audit entries
+            if entry.level == "error":
+                colored = click.style(line, fg="red")
+            elif entry.level in ("warn", "warning"):
+                colored = click.style(line, fg="yellow")
+            else:
+                colored = click.style(line, fg="cyan")
+            click.echo(indent + colored)
+        click.echo("")  # a blank line before mapping output
+
     # 4) Apply mapping to get raw records and collect issues
     notepad = create_notepad("phenopackets")
-    genotype_records, phenotype_records = mapper.apply_mapping(tables, notepad)
+    phenopackets = mapper.apply_mapping(tables, notepad)
+    # Refactor: mapper returns list[Phenopacket]; counts are exposed via mapper.stats.
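The refactor flagged in the comment above — `apply_mapping` returning fully assembled packets while simple counts ride along on a `stats` attribute — can be sketched with a stub. The `StubMapper` class and the dict-shaped rows are illustrative stand-ins, not the project's real types:

```python
class StubMapper:
    """Stand-in illustrating the packets-plus-stats contract."""

    def apply_mapping(self, tables):
        # pretend every row of every sheet becomes one packet
        packets = [row for rows in tables.values() for row in rows]
        # counts ride along as an attribute so the return type stays a plain list
        self.stats = {
            "patients": len({p["id"] for p in packets}),
            "genotypes": sum(1 for p in packets if p["kind"] == "genotype"),
        }
        return packets


mapper = StubMapper()
packets = mapper.apply_mapping(
    {"sheet1": [{"id": "P1", "kind": "genotype"}, {"id": "P2", "kind": "phenotype"}]}
)
```

The side-channel `stats` keeps the CLI's summary lines working without widening the return type of `apply_mapping`.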
+
+    # 8) Serialize phenopackets per patient
+    output_dir = _prepare_output_dir()
+    for i, pkt in enumerate(phenopackets, start=1):
+        with open(output_dir / f"{i}.json", "w", encoding="utf-8") as out_f:
+            out_f.write(MessageToJson(pkt))
+    # Use mapper.stats["patients"] when available; fall back to the packet count
+    # 9) Final summary
+    click.echo(
+        f"Wrote {mapper.stats.get('patients', len(phenopackets))} phenopacket files to {output_dir}"
+    )
+    # TODO: Come back and add more top-level fields

     # 5) Report any errors or warnings
     _report_issues(notepad)
-    # pps = mapper.apply_mapping(all_sheets, notepad)
-    # assert not notepad.has_errors_or_warnings(include_subsections=True)
-    # TODO: write phenopackets to a folder
-    # click.echo(f"Created {len(pps)} Phenotype objects")
-    # 6) Group results by patient
-    records_by_patient = _group_records_by_patient(genotype_records, phenotype_records)
-    # 7) Prepare output directory with timestamp
-    # Will contain genotype and phenotype records as JSON
-    generated_phenopacket_output_dir = _prepare_output_dir()
-    # 8) Serialize phenopackets per patient
-    _write_phenopackets(records_by_patient, generated_phenopacket_output_dir)
-    # 9) Final summary
-    click.echo(
-        f"Wrote {len(records_by_patient)} phenopacket files to {generated_phenopacket_output_dir}"
-    )
-    click.echo(f"Created {len(genotype_records)} Genotype objects")
-    click.echo(f"Created {len(phenotype_records)} Phenotype objects")
+    # Maintain exact lines expected by tests:
+    counts = getattr(mapper, "stats", {})
+    click.echo(f"Created {counts.get('genotypes', 0)} Genotype objects")
+    click.echo(f"Created {counts.get('phenotypes', 0)} Phenotype objects")
+    # TODO:
(For printing other counts, I need to come back and mirror the same pattern: + # counts.get('diseases', 0), counts.get('measurements', 0), counts.get('biosamples', 0)) def _locate_hpo_file(hpo_path: typing.Optional[str]) -> pathlib.Path: @@ -181,17 +268,37 @@ def _report_issues(notepad): def _group_records_by_patient( - genotype_records: list, phenotype_records: list + genotype_records: list, + phenotype_records: list, + disease_records: list, + measurement_records: list, + biosample_records: list, ) -> dict[str, dict[str, list]]: # Group genotype & phenotype records by patient ID - records = defaultdict(lambda: {"genotype_records": [], "phenotype_records": []}) + records = defaultdict( + lambda: { + "genotype_records": [], + "phenotype_records": [], + "disease_records": [], + "measurement_records": [], + "biosample_records": [], + } + ) for genotype in genotype_records: records[genotype.genotype_patient_ID]["genotype_records"].append(genotype) for phenotype in phenotype_records: records[phenotype.phenotype_patient_ID]["phenotype_records"].append(phenotype) + for disease in disease_records: + records[disease.patient_ID]["disease_records"].append(disease) + for measurement in measurement_records: + records[measurement.patient_ID]["measurement_records"].append(measurement) + for biosample in biosample_records: + records[biosample.patient_ID]["biosample_records"].append(biosample) return records +# 7) Prepare output directory with timestamp +# Will contain genotype and phenotype records as JSON def _prepare_output_dir() -> pathlib.Path: # use YYYY-MM-DD_HH-MM-SS for human-readable timestamps timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") @@ -240,9 +347,7 @@ def _write_phenopackets( genomic_interpretation_entry.InterpretationStatus.CONTRIBUTORY ) - # now fill in the VariationDescriptor - # TODO: set this up later - # Omit setting gene_context for now. + # TODO: Revise VariationDescriptor and gene_context later, omit setting gene_context for now. 
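The `defaultdict` grouping pattern that `_group_records_by_patient` extends can be exercised in isolation. The minimal `namedtuple` records below are hypothetical stand-ins; only the attribute names (`genotype_patient_ID`, `patient_ID`) mirror the diff:

```python
from collections import defaultdict, namedtuple

# hypothetical minimal records, mirroring the attribute names used above
Genotype = namedtuple("Genotype", ["genotype_patient_ID", "gene_symbol"])
Disease = namedtuple("Disease", ["patient_ID", "disease_term"])


def group_by_patient(genotype_records, disease_records):
    # every patient gets all buckets, even when a bucket stays empty
    records = defaultdict(lambda: {"genotype_records": [], "disease_records": []})
    for g in genotype_records:
        records[g.genotype_patient_ID]["genotype_records"].append(g)
    for d in disease_records:
        records[d.patient_ID]["disease_records"].append(d)
    return records


grouped = group_by_patient(
    [Genotype("P1", "BRCA1")],
    [Disease("P1", "OMIM:266600"), Disease("P2", "OMIM:114480")],
)
```

The `lambda` default means a patient seen only in one sheet still gets every bucket, so downstream code can iterate without key checks.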
# variation_descriptor = genomic_interpretation_entry.variant_interpretation.variation_descriptor
 # we can also set variation_descriptor.gene_context and variation_descriptor.allelic_state here then serialize out as before
 # variation_descriptor.gene_context.gene_symbol = genotype_record.gene_symbol
@@ -253,13 +358,13 @@
             variation_descriptor = variant_interpretation.variation_descriptor

             # 1) Gene symbol & allelic state
-            # 'gene_context' is a message; you must CopyFrom if setting a message,
-            # but for its scalar fields you can still assign directly:
+            # 'gene_context' is a message; we need to CopyFrom if setting a message,
+            # but for its scalar fields we can still assign directly:
             variation_descriptor.gene_context.symbol = genotype_record.gene_symbol
             variation_descriptor.allelic_state.CopyFrom(
                 pps2.OntologyClass(
                     id="GENO:"
-                    + genotype_record.zygosity_code,  # or however you construct this
+                    + genotype_record.zygosity_code,  # or however we decide to construct this later on
                     label=genotype_record.zygosity,
                 )
             )
@@ -291,11 +396,24 @@
                 # some protobufs give trouble when trying to expose location/alleles so just skip
                 pass

-            # TODO: when ready, add an Expression.HGVS here
-            # Record the HGVS genomic notation as an Expression
-            # expr = variation_descriptor.expressions.add()
-            # expr.syntax = Phenopacket.Diagnosis.GenomicInterpretation.VariantInterpretation.VariationDescriptor.Expression.HGVS
-            # expr.value = genotype_record.hgvsg
+        # 3c) Add optional entries (if any). Disease, Measurement, and Biosample
+        # fields are mostly messages in the Phenopacket v2 schema, so we assign
+        # their subfields rather than the message itself:
+        for d in patient_data["disease_records"]:
+            ds = phenopacket.diseases.add()
+            ds.term.id = d.disease_term
+            ds.term.label = d.disease_label
+            # 'onset' is a TimeElement; parse the 'YYYY-MM-DD' string into its timestamp
+            ds.onset.timestamp.FromJsonString(f"{d.disease_onset}T00:00:00Z")
+            # the schema has no 'status' field; 'excluded' is True when the disease was ruled out
+            ds.excluded = not d.disease_status
+        for m in patient_data["measurement_records"]:
+            meas = phenopacket.measurements.add()
+            meas.assay.id = m.measurement_type
+            # numeric results live under Value.quantity
+            meas.value.quantity.value = float(m.measurement_value)
+            meas.value.quantity.unit.id = m.measurement_unit
+            # assuming the timestamp is also a 'YYYY-MM-DD' string
+            meas.time_observed.timestamp.FromJsonString(
+                f"{m.measurement_timestamp}T00:00:00Z"
+            )
+        for b in patient_data["biosample_records"]:
+            bs = phenopacket.biosamples.add()
+            bs.id = b.biosample_id
+            bs.sampled_tissue.id = b.biosample_type
+            bs.time_of_collection.timestamp.FromJsonString(f"{b.collection_date}T00:00:00Z")

         # 3d) Serialize to JSON
         generated_phenopacket_output_path = (
@@ -305,5 +423,69 @@
         out_f.write(MessageToJson(phenopacket))

+def preprocess(tables: dict[str, pd.DataFrame]) -> list[AuditEntry]:
+    """
+    Run lightweight audits on each sheet:
+      - header normalization
+      - sheet classification
+      - variant-column presence (raw vs HGVS)
+    """
+    from .mapper import (
+        RAW_VARIANT_COLUMNS,
+        HGVS_VARIANT_COLUMNS,
+        GENOTYPE_BASE_COLUMNS,
+        PHENOTYPE_KEY_COLUMNS,
+    )
+
+    entries: list[AuditEntry] = []
+
+    # Step 1: header counts
+    for name, df in tables.items():
+        entries.append(
+            AuditEntry(
+                step="normalize-headers",
+                sheet=name,
+                message=f"{len(df.columns)} cols",
+                level="info",
+            )
+        )
+
+    # Step 2: classify
+    for name, df in tables.items():
+        cols = set(df.columns)
+        has_raw = RAW_VARIANT_COLUMNS.issubset(cols)
+        has_hgvs = bool(HGVS_VARIANT_COLUMNS & cols)
+        is_gen = GENOTYPE_BASE_COLUMNS.issubset(cols) and (has_raw or has_hgvs)
+        is_pheno = PHENOTYPE_KEY_COLUMNS.issubset(cols)
+
+        kind = "genotype" if is_gen else "phenotype" if is_pheno else "skip"
+        # only annotate the variant-column flavour when there is one to report
+        detail = (
+            " (raw+hgvs)" if has_raw and has_hgvs
+            else " (raw)" if has_raw
+            else " (hgvs)" if has_hgvs
+            else ""
+        )
+        entries.append(
+            AuditEntry(
+                step="classify-sheet",
+                sheet=name,
+                message=kind + detail,
+                level="info",
+            )
+        )
+
+    # Step 3: variant columns
+    for name, df in tables.items():
+        cols = set(df.columns)
+        if GENOTYPE_BASE_COLUMNS.issubset(cols):
+            if not (RAW_VARIANT_COLUMNS.issubset(cols) or HGVS_VARIANT_COLUMNS & cols):
+                entries.append(
+                    AuditEntry(
+                        step="variant-check",
+                        sheet=name,
+                        message="missing raw & HGVS",
+                        level="error",
+                    )
+                )
+    return entries
+
+
 if __name__ == "__main__":
     main()
diff --git a/src/P6/biosample.py b/src/P6/biosample.py
new file mode 100644
index 0000000..13dbbf6
--- /dev/null
+++ b/src/P6/biosample.py
@@ -0,0
+1,25 @@ +""" +Biosample domain model. + +Defines the BiosampleRecord dataclass for capturing sample metadata. +""" + +from dataclasses import dataclass + + +@dataclass +class BiosampleRecord: + """ + Represents a biosample entry for a patient. + + Attributes: + patient_ID: Unique alphanumeric patient identifier. + biosample_id: Unique identifier for the biosample. + biosample_type: CURIE of the tissue or sample type (e.g. 'UBERON:0002107'). + collection_date: Date string in 'YYYY-MM-DD' format. + """ + + patient_ID: str + biosample_id: str + biosample_type: str + collection_date: str diff --git a/src/P6/disease.py b/src/P6/disease.py new file mode 100644 index 0000000..d044623 --- /dev/null +++ b/src/P6/disease.py @@ -0,0 +1,27 @@ +""" +Disease domain model. + +Defines the DiseaseRecord dataclass for capturing disease annotations. +""" + +from dataclasses import dataclass + + +@dataclass +class DiseaseRecord: + """ + Represents a disease entry for a patient. + + Attributes: + patient_ID: Unique alphanumeric patient identifier. + disease_term: CURIE of the disease term (e.g. 'OMIM:266600'). + disease_label: Human-readable label for the disease. + disease_onset: Date string in 'YYYY-MM-DD' format. + disease_status: True if the disease is present, False if excluded. 
+ """ + + patient_ID: str + disease_term: str + disease_label: str + disease_onset: str + disease_status: bool diff --git a/src/P6/loader.py b/src/P6/loader.py index a353d95..6cc8f55 100644 --- a/src/P6/loader.py +++ b/src/P6/loader.py @@ -13,6 +13,17 @@ "hpo": "hpo_id", "hpo_term": "hpo_id", # also catch "HPO Term" → hpo_term → hpo_id "timestamp": "date_of_observation", + "disease_term": "disease_term", + "disease_label": "disease_label", + "disease_onset": "disease_onset", + "disease_status": "disease_status", + "measurement_type": "measurement_type", + "measurement_value": "measurement_value", + "measurement_unit": "measurement_unit", + "measurement_timestamp": "measurement_timestamp", + "biosample_id": "biosample_id", + "biosample_type": "biosample_type", + "collection_date": "collection_date", } diff --git a/src/P6/mapper.py b/src/P6/mapper.py index 169967d..c17e9b9 100644 --- a/src/P6/mapper.py +++ b/src/P6/mapper.py @@ -12,20 +12,23 @@ import re import typing +from collections import defaultdict +from dataclasses import dataclass from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket from stairval.notepad import Notepad +from typing import List, TypeVar, Tuple +from .biosample import BiosampleRecord +from .disease import DiseaseRecord from .genotype import Genotype +from .measurement import MeasurementRecord from .phenotype import Phenotype +import phenopackets.schema.v2 as pps2 -class TableMapper(metaclass=abc.ABCMeta): - @abc.abstractmethod - def apply_mapping( - self, tables: dict[str, pd.DataFrame], notepad: Notepad - ) -> typing.Sequence[Phenopacket]: - pass - +T = TypeVar("T") +RowParseResult = Tuple[List[T], List[hpotk.TermId]] +# gives us one consistent return shape: (parsed_items, aux_ids_for_batch_validation) # For any renamed field, the two neighbors it must sit between EXPECTED_COLUMN_NEIGHBORS = { @@ -55,6 +58,11 @@ def apply_mapping( PHENOTYPE_KEY_COLUMNS = {"hpo_id", "date_of_observation", "status"} +# Key columns to identify 
additional sheets
+DISEASE_KEY_COLUMNS = {"disease_term", "disease_onset"}
+MEASUREMENT_KEY_COLUMNS = {"measurement_type", "measurement_value", "measurement_unit"}
+BIOSAMPLE_KEY_COLUMNS = {"biosample_id", "biosample_type", "collection_date"}
+
 # Map raw zygosity abbreviations to allowed dataclass zygosity values
 ZYGOSITY_MAP = {
     "het": "heterozygous",
@@ -71,43 +79,112 @@
     "denovo": "de_novo_mutation",
 }

+# Variant column groups used for validation and HGVS↔raw consistency checks
+RAW_VARIANT_COLUMNS = {
+    "chromosome",
+    "start_position",
+    "end_position",
+    "reference",
+    "alternate",
+}
+HGVS_VARIANT_COLUMNS = {"hgvsg", "hgvsc", "hgvsp"}
+# minimal base columns to call something a genotype sheet (we bring the index in later)
+GENOTYPE_BASE_COLUMNS = {"contact_email", "phasing"}
+
+# Friendly aliases → reduce friction while keeping behavior explicit
+KNOWN_SHEET_ALIASES: dict[str, set[str]] = {
+    "genotype": {"genotype", "variants", "variant", "geno"},
+    "phenotype": {"phenotype", "hpo", "pheno"},
+    "diseases": {"disease", "diseases"},
+    "measurements": {"measurement", "measurements", "labs"},
+    "biosamples": {"biosample", "biosamples", "samples"},
+}
+
+
+@dataclass
+class TypedTables:
+    """
+    Explicit, typed access to workbook sheets.
+    Any field can be `None`, meaning that sheet was not provided.
+ """ + + genotype: pd.DataFrame | None + phenotype: pd.DataFrame | None + diseases: pd.DataFrame | None + measurements: pd.DataFrame | None + biosamples: pd.DataFrame | None -class DefaultMapper(TableMapper): - def __init__(self, hpo: hpotk.MinimalOntology): - self._hpo = hpo +class TableMapper(metaclass=abc.ABCMeta): + @abc.abstractmethod def apply_mapping( self, tables: dict[str, pd.DataFrame], notepad: Notepad - ) -> tuple[list[Genotype], list[Phenotype]]: - genotype_records: list[Genotype] = [] - phenotype_records: list[Phenotype] = [] - - for sheet_name, df in tables.items(): - columns = set(df.columns) - """Send each sheet to the right extractor and collect all records.""" - is_genotype_sheet = GENOTYPE_KEY_COLUMNS.issubset(columns) - is_phenotype_sheet = PHENOTYPE_KEY_COLUMNS.issubset(columns) - - if is_genotype_sheet == is_phenotype_sheet: - # ambiguous sheet should give a warning instead of an error - notepad.add_warning( - f"Skipping {sheet_name!r}: cannot unambiguously classify as genotype or phenotype" - ) - continue + ) -> typing.Sequence[Phenopacket]: + # return fully-assembled Phenopacket messages, not intermediate parts. 
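The `TableMapper` base class above uses the standard `abc` pattern: declaring `apply_mapping` abstract makes the base class uninstantiable and forces concrete subclasses to override it. A minimal sketch with simplified types (the real signatures take DataFrames and a notepad):

```python
import abc


class TableMapper(metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def apply_mapping(self, tables: dict) -> list:
        raise NotImplementedError


class SheetNameMapper(TableMapper):
    """Trivial concrete mapper: just returns the sheet names, sorted."""

    def apply_mapping(self, tables: dict) -> list:
        return sorted(tables)


try:
    TableMapper()  # fails: the abstract method was never overridden
except TypeError as err:
    failure = str(err)

result = SheetNameMapper().apply_mapping({"pheno": None, "geno": None})
```

Instantiation of the base raises `TypeError` at object-creation time, which surfaces a missing override much earlier than a `NotImplementedError` at call time would.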
+        raise NotImplementedError

-            # rename the former-index column
-            working = self._prepare_sheet(df, is_genotype_sheet)
-            if is_genotype_sheet:
-                genotype_records.extend(
-                    self._map_genotype(sheet_name, working, notepad)
-                )
-            else:
-                phenotype_records.extend(
-                    self._map_phenotype(sheet_name, working, notepad)
-                )

+class DefaultMapper(TableMapper):
+    def __init__(self, hpo: hpotk.MinimalOntology, strict_variants: bool = False):
+        """
+        strict_variants controls how raw⇄HGVS mismatches are reported:
+          - False: raw⇄HGVS mismatches are logged as WARNINGS
+          - True : raw⇄HGVS mismatches are logged as ERRORS
+        """
+        self._hpo = hpo
+        self.strict_variants = strict_variants

-        return genotype_records, phenotype_records
+    def apply_mapping(
+        self, tables: dict[str, pd.DataFrame], notepad: Notepad
+    ) -> list[Phenopacket]:
+        """
+        Process:
+          1) choose/validate input tables
+          2) map rows to domain records
+          3) group records per patient
+          4) construct a Phenopacket per patient
+          5) return the list of packets
+        """
+        # TODO: implement the placeholder table-level mappers called below.
+        # Map each selected sheet to domain-specific records via the table-level wrappers.
+        # The wrappers handle index→patient id normalization and any sheet-level checks,
+        # then delegate to the row mappers.
+        typed_tables = self._choose_named_tables(tables, notepad)
+        genotype_records = self._map_genotype_table(typed_tables.genotype, notepad)
+        phenotype_records = self._map_phenotype_table(typed_tables.phenotype, notepad)
+        disease_records = self._map_diseases_table(typed_tables.diseases, notepad)
+        measurement_records = self._map_measurements_table(
+            typed_tables.measurements, notepad
+        )
+        biosample_records = self._map_biosamples_table(typed_tables.biosamples, notepad)
+
+        # 6) Group results by patient
+        grouped = self._group_records_by_patient(
+            genotype_records,
+            phenotype_records,
+            disease_records,
+            measurement_records,
+            biosample_records,
+        )
+
+        packets: list[Phenopacket] = [
+            self.construct_phenopacket_for_patient(patient_id, bundle, notepad)
+            for patient_id, bundle in grouped.items()
+        ]
+
+        # Backward compatibility for CLI/tests:
+        # Expose simple counts without changing the return type.
+        # The CLI prints "Created N Genotype objects" / "Created N Phenotype objects"
+        # and the tests assert on those exact lines.
+ self.stats = { + "genotypes": len(genotype_records), + "phenotypes": len(phenotype_records), + "diseases": len(disease_records), + "measurements": len(measurement_records), + "biosamples": len(biosample_records), + "patients": len(grouped), + } + + return packets def _prepare_sheet(self, df: pd.DataFrame, is_genotype: bool) -> pd.DataFrame: """Bring the index into a column and name it appropriately.""" @@ -116,142 +193,237 @@ def _prepare_sheet(self, df: pd.DataFrame, is_genotype: bool) -> pd.DataFrame: original = working.columns[0] return working.rename(columns={original: column_id}) - def _map_genotype( - self, sheet_name: str, df: pd.DataFrame, notepad: Notepad - ) -> list[Genotype]: - records: list[Genotype] = [] - for idx, row in df.iterrows(): - # handle slash‑separated zygosity and inheritance - list_of_zygosity_types = [ - z.strip().lower() for z in str(row["zygosity"]).split("/") - ] - list_of_inheritance_types = [ - i.strip().lower() for i in str(row["inheritance"]).split("/") - ] - for zygosity_type, inheritance_type in zip( - list_of_zygosity_types, list_of_inheritance_types - ): - if zygosity_type not in ZYGOSITY_MAP: - notepad.add_error( - f"Sheet {sheet_name!r}: Unrecognized zygosity code {zygosity_type!r}" - ) - if inheritance_type not in INHERITANCE_MAP: - notepad.add_error( - f"Sheet {sheet_name!r}: Unrecognized inheritance code {inheritance_type!r}" - ) - # allow missing/NaN contact_email → substitute dummy - raw_email = row["contact_email"] - contact_email = ( - "unknown@example.com" - if pd.isna(raw_email) - else str(raw_email).strip() - ) - kwargs = { - "genotype_patient_ID": str(row["genotype_patient_ID"]), - "contact_email": contact_email, - "phasing": bool(row["phasing"]), - "chromosome": str(row["chromosome"]), - "start_position": int(row["start_position"]), - "end_position": int(row["end_position"]), - "reference": str(row["reference"]), - "alternate": str(row["alternate"]), - "gene_symbol": str(row["gene_symbol"]), - "hgvsg": 
str(row["hgvsg"]), - "hgvsc": str(row["hgvsc"]), - "hgvsp": str(row["hgvsp"]), - "zygosity": ZYGOSITY_MAP[zygosity_type], - "inheritance": INHERITANCE_MAP[inheritance_type], - } - try: - records.append(Genotype(**kwargs)) - except (ValueError, TypeError) as e: - notepad.add_error(f"Sheet {sheet_name!r}, row {idx}: {e}") - return records - - def _map_phenotype( - self, sheet_name: str, df: pd.DataFrame, notepad: Notepad - ) -> list[Phenotype]: - records: list[Phenotype] = [] - # Collect every HPO ID in this sheet, so we can validate propagation later: - all_ids: list[hpotk.TermId] = [] - - for idx, row in df.iterrows(): - # normalize phenotype fields into valid strings - hpo_cell = str(row["hpo_id"]).strip() - # Parse optional label and digits - # extract the last token (it should just be the HPO code), case‑insensitive - m = re.match( - r""" - ^\s* - (?P