Emit canonical-DD TSV from parsed dbGaP digests

amc-corey-cox · amc-corey-cox · commit 68731b04340f · 2026-05-05T12:51:13.000-05:00
Adds a `parse-digests` CLI command that reads cached data_dict.xml files for a cohort and writes one TSV per data table in the schema-automator canonical data dictionary format (linkml/schema-automator#201). Outputs land at `output/<cohort>/dd/<phs>.<pht>.dd.tsv` with all Spec A columns plus `uri` from Spec B. dbGaP types are translated to the canonical 10-value vocabulary; encoded values are rendered REDCap-style (`code, label | code, label`); each variable's `uri` carries the dbGaP phv accession as a CURIE for traceability. `unit`, `min`, and `max` are emitted empty pending richer var_report parsing. Refs #204
diff --git a/src/dm_bip/cli.py b/src/dm_bip/cli.py
@@ -140,5 +140,24 @@ def fetch_digests(
     )
 
 
+@app.command()
+def parse_digests(
+    cohort_key: Annotated[str, typer.Argument(help="Cohort key (e.g. jhs, aric)")],
+    cache_dir: Annotated[Path, typer.Option("--cache-dir", help="dbGaP digest cache (input)")] = Path(".dbgap-cache"),
+    output_dir: Annotated[Path, typer.Option("--output-dir", "-o", help="Output root directory")] = Path("output"),
+    refresh_cohorts: Annotated[bool, typer.Option("--refresh-cohorts", help="Re-fetch cohorts.yaml")] = False,
+):
+    """Convert cached dbGaP digests for a cohort into schema-automator canonical-DD TSVs."""
+    from dm_bip.prepare_study.fetch_digests import load_cohorts, parse_cached_digests
+    cohorts = load_cohorts(cache_dir=cache_dir, refresh=refresh_cohorts)
+    if cohort_key not in cohorts:
+        # Diagnostics go to stderr so stdout carries only the success summary; the command then exits with status 2.
+        typer.echo(f"Unknown cohort '{cohort_key}'. Available: {', '.join(sorted(cohorts))}", err=True)
+        raise typer.Exit(code=2)
+
+    written = parse_cached_digests(cohorts[cohort_key], cache_root=cache_dir, output_root=output_dir)
+    typer.echo(f"Wrote {len(written)} data dictionaries under {output_dir / cohort_key / 'dd'}")
+
+
 if __name__ == "__main__":
     app()
diff --git a/src/dm_bip/prepare_study/fetch_digests.py b/src/dm_bip/prepare_study/fetch_digests.py
@@ -284,3 +284,93 @@ def fetch_digests(
     result.data_dicts.sort()
     result.var_reports.sort()
     return result
+
+
+# --- Canonical data-dictionary TSV output ------------------------------------
+
+# Column order of the canonical-DD TSV; write_canonical_dd emits the header and every row in this order.
+DD_TSV_COLUMNS = ("name", "type", "description", "codes", "unit", "min", "max", "uri")
+
+# dbGaP <type> text -> canonical type name.
+# _translate_type looks keys up after strip().lower(), so every key here is lower-case.
+_DBGAP_TYPE_MAP = {
+    # Types spelled the same in both vocabularies.
+    **{same: same for same in ("string", "integer", "decimal")},
+    # The one entry whose canonical name differs from the dbGaP one.
+    "encoded value": "permissible_values",
+    **{same: same for same in ("date", "datetime", "time", "boolean")},
+}
+
+
+def _translate_type(dbgap_type: str | None) -> str:
+    """Return the canonical type for a dbGaP <type> string; missing or unmapped types become 'string'."""
+    if dbgap_type:
+        key = dbgap_type.strip().lower()  # the lookup ignores surrounding whitespace and letter case
+        if key in _DBGAP_TYPE_MAP:
+            return _DBGAP_TYPE_MAP[key]
+        # Only a non-empty, unmapped type is logged; None and "" fall through silently.
+        logger.debug("Unknown dbGaP type %r; defaulting to 'string'", dbgap_type)
+    return "string"
+
+
+def _sanitize_label(label: str) -> str:
+    """Neutralize label characters that would break the TSV row (tab, CR, LF) or the codes separators ('|', ',')."""
+    return label.translate(str.maketrans({"\t": " ", "\r": " ", "\n": " ", "|": "/", ",": ";"}))
+
+
+def _encode_codes(values: list[DigestValue]) -> str:
+    """Serialize values REDCap-style as 'code, label | code, label | ...' (labels sanitized, codes verbatim)."""
+    return " | ".join([f"{entry.code}, {_sanitize_label(entry.label)}" for entry in values])
+
+
+def _dd_row(variable: DigestVariable) -> dict[str, str]:
+    """Return the canonical-DD cells for one parsed dbGaP variable, keyed by column name."""
+    return dict(
+        name=variable.name,
+        type=_translate_type(variable.type),
+        description=variable.description or "",  # a missing description becomes an empty cell
+        codes=_encode_codes(variable.values),
+        # unit/min/max are always emitted empty by this function.
+        **dict.fromkeys(("unit", "min", "max"), ""),
+        # The variable id is carried as a 'dbgap:'-prefixed CURIE; no id means an empty uri.
+        uri=f"dbgap:{variable.id}" if variable.id else "",
+    )
+
+
+def write_canonical_dd(dd: DataDictionary, output_path: Path) -> Path:
+    """Write `dd` as a canonical-DD TSV at `output_path` (parent directories are created); return `output_path`."""
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    flatten = str.maketrans("\t\r\n", "   ")  # cells are written unquoted: a tab/CR/LF inside one would split the row
+    with output_path.open("w", encoding="utf-8") as f:
+        f.write("\t".join(DD_TSV_COLUMNS) + "\n")
+        for row in map(_dd_row, dd.variables):
+            f.write("\t".join(row[col].translate(flatten) for col in DD_TSV_COLUMNS) + "\n")
+    logger.info("Wrote %s (%d variables)", output_path, len(dd.variables))
+    return output_path
+
+
+def _dd_output_filename(dd: DataDictionary) -> str:
+    """Name the output '<phs>.<pht>.dd.tsv', using each accession's text before its first '.' (or a placeholder)."""
+    accessions = ((dd.study_id, "unknown_phs"), (dd.data_table_id, "unknown_pht"))
+    phs, pht = ((accession or placeholder).partition(".")[0] for accession, placeholder in accessions)
+    return f"{phs}.{pht}.dd.tsv"
+
+
+def parse_cached_digests(
+    cohort: Cohort,
+    cache_root: Path = DEFAULT_CACHE_DIR,
+    output_root: Path = Path("output"),
+) -> list[Path]:
+    """Write one canonical-DD TSV per cached *.data_dict.xml of `cohort` and return the written paths (offline).
+
+    Input is the directory given by _study_cache_path(cache_root, cohort); output goes to
+    <output_root>/<cohort.key>/dd/. Raises FileNotFoundError if that cache directory does not exist.
+    """
+    digest_dir = _study_cache_path(cache_root, cohort)
+    if not digest_dir.exists():
+        raise FileNotFoundError(f"No cached digests at {digest_dir} (run fetch-digests first)")
+
+    target_dir = output_root / cohort.key / "dd"
+    # Sorted so files are handled, and paths returned, in a stable order; map() keeps parse/write interleaved.
+    parsed = map(parse_data_dict, sorted(digest_dir.glob("*.data_dict.xml")))
+    return [write_canonical_dd(dd, target_dir / _dd_output_filename(dd)) for dd in parsed]
diff --git a/tests/unit/test_fetch_digests.py b/tests/unit/test_fetch_digests.py
@@ -3,9 +3,11 @@
 from pathlib import Path
 
 from dm_bip.prepare_study.fetch_digests import (
+    DD_TSV_COLUMNS,
     DigestValue,
     parse_data_dict,
     parse_var_report,
+    write_canonical_dd,
 )
 
 FIXTURES = Path(__file__).parent.parent / "input" / "dbgap_digests"
@@ -50,3 +52,29 @@ def test_parse_var_report_jhs_subject():
     assert total_row.stats is not None
     assert total_row.stats.n == 5885
     assert total_row.stats.nulls == 0
+
+
+def test_write_canonical_dd_jhs_subject(tmp_path):
+    """Convert the JHS Subject data_dict to canonical TSV; check shape, types, codes."""
+    dd = parse_data_dict(FIXTURES / "JHS_Subject.data_dict.xml")
+    out_path = tmp_path / "jhs.subject.dd.tsv"
+    write_canonical_dd(dd, out_path)
+
+    # Trim only the final newline: a bare strip() would also eat trailing tabs, i.e. empty cells of the last row.
+    lines = out_path.read_text(encoding="utf-8").rstrip("\n").split("\n")
+    header = lines[0].split("\t")
+    assert tuple(header) == DD_TSV_COLUMNS
+    rows = [dict(zip(header, line.split("\t"), strict=True)) for line in lines[1:]]  # strict: wrong cell count fails
+    by_name = {r["name"]: r for r in rows}
+
+    subject = by_name["SUBJECT_ID"]
+    assert subject["type"] == "string"
+    assert subject["description"] == "Subject ID"
+    assert subject["codes"] == ""
+    assert subject["uri"].startswith("dbgap:phv")
+
+    consent = by_name["CONSENT"]
+    assert consent["type"] == "permissible_values"
+    assert consent["codes"].startswith("0, ")
+    assert " | 1, " in consent["codes"]
+    assert "," not in consent["codes"].split("|")[0].split(",", 1)[1]  # comma-in-label sanitized to ;