Skip to content

Commit 68731b0

Browse files
committed
Emit canonical-DD TSV from parsed dbGaP digests
Adds a `parse-digests` CLI command that reads cached data_dict.xml files for a cohort and writes one TSV per data table in the schema-automator canonical data dictionary format (linkml/schema-automator#201). Outputs land at `output/<cohort>/dd/<phs>.<pht>.dd.tsv` with all Spec A columns plus `uri` from Spec B. dbGaP types are translated to the canonical 10-value vocabulary; encoded values are rendered REDCap-style (`code, label | code, label`); each variable's `uri` carries the dbGaP phv accession as a CURIE for traceability. `unit`, `min`, and `max` are emitted empty pending richer var_report parsing. Refs #204
1 parent ccfd4b9 commit 68731b0

3 files changed

Lines changed: 137 additions & 0 deletions

File tree

src/dm_bip/cli.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,5 +140,24 @@ def fetch_digests(
140140
)
141141

142142

143+
@app.command()
def parse_digests(
    cohort_key: Annotated[str, typer.Argument(help="Cohort key (e.g. jhs, aric)")],
    cache_dir: Annotated[Path, typer.Option("--cache-dir", help="dbGaP digest cache (input)")] = Path(".dbgap-cache"),
    output_dir: Annotated[Path, typer.Option("--output-dir", "-o", help="Output root directory")] = Path("output"),
    refresh_cohorts: Annotated[bool, typer.Option("--refresh-cohorts", help="Re-fetch cohorts.yaml")] = False,
):
    """Convert cached dbGaP digests for a cohort into schema-automator canonical-DD TSVs."""
    # NOTE: the docstring above doubles as the command's --help text, so it is kept short.
    #
    # Exit codes: 0 on success, 2 when the cohort key is unknown, 1 when the cohort has
    # no cached digests to convert.
    from dm_bip.prepare_study.fetch_digests import load_cohorts, parse_cached_digests

    cohorts = load_cohorts(cache_dir=cache_dir, refresh=refresh_cohorts)
    if cohort_key not in cohorts:
        # Diagnostics go to stderr so stdout stays clean for the success summary.
        typer.echo(f"Unknown cohort '{cohort_key}'. Available: {', '.join(sorted(cohorts))}", err=True)
        raise typer.Exit(code=2)

    try:
        written = parse_cached_digests(cohorts[cohort_key], cache_root=cache_dir, output_root=output_dir)
    except FileNotFoundError as exc:
        # parse_cached_digests raises this with an actionable message ("run fetch-digests
        # first"); show that message instead of a traceback.
        typer.echo(str(exc), err=True)
        raise typer.Exit(code=1) from exc

    typer.echo(f"Wrote {len(written)} data dictionaries under {output_dir / cohort_key / 'dd'}")
160+
161+
143162
if __name__ == "__main__":
144163
app()

src/dm_bip/prepare_study/fetch_digests.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,3 +284,93 @@ def fetch_digests(
284284
result.data_dicts.sort()
285285
result.var_reports.sort()
286286
return result
287+
288+
289+
# --- Canonical data-dictionary TSV output ------------------------------------
290+
291+
# Column names of the canonical data-dictionary TSV, in output order.
DD_TSV_COLUMNS: tuple[str, ...] = (
    "name",
    "type",
    "description",
    "codes",
    "unit",
    "min",
    "max",
    "uri",
)
292+
293+
_DBGAP_TYPE_MAP = {
294+
"string": "string",
295+
"integer": "integer",
296+
"decimal": "decimal",
297+
"encoded value": "permissible_values",
298+
"date": "date",
299+
"datetime": "datetime",
300+
"time": "time",
301+
"boolean": "boolean",
302+
}
303+
304+
305+
def _translate_type(dbgap_type: str | None) -> str:
    """Map a dbGaP <type> value onto the canonical data-dictionary type vocabulary.

    The lookup ignores surrounding whitespace and letter case. A missing or empty
    type, and any type that is not in ``_DBGAP_TYPE_MAP``, yields ``"string"``;
    the unknown case is also reported at debug level.
    """
    # NOTE(review): composite values such as "integer, encoded value" are not in the
    # map and therefore fall back to "string" -- confirm that is the intended result.
    if dbgap_type:
        key = dbgap_type.strip().lower()
        if key in _DBGAP_TYPE_MAP:
            return _DBGAP_TYPE_MAP[key]
        logger.debug("Unknown dbGaP type %r; defaulting to 'string'", dbgap_type)
    return "string"
314+
315+
316+
def _sanitize_label(label: str) -> str:
317+
"""Replace characters that conflict with the codes-encoding separators."""
318+
return label.replace("\t", " ").replace("|", "/").replace(",", ";")
319+
320+
321+
def _encode_codes(values: list[DigestValue]) -> str:
    """Join encoded values into one REDCap-style string: 'code, label | code, label | ...'.

    Each label is passed through ``_sanitize_label`` first; codes are emitted as-is.
    An empty list produces an empty string.
    """
    pairs = [(entry.code, _sanitize_label(entry.label)) for entry in values]
    return " | ".join(f"{code}, {label}" for code, label in pairs)
324+
325+
326+
def _dd_row(variable: DigestVariable) -> dict[str, str]:
    """Turn one parsed dbGaP variable into a canonical-DD row keyed by column name.

    ``unit``, ``min`` and ``max`` are always emitted empty. ``uri`` is the variable
    id prefixed with ``dbgap:``, or empty when the variable has no id.
    """
    description = variable.description or ""
    uri = f"dbgap:{variable.id}" if variable.id else ""
    return dict(
        name=variable.name,
        type=_translate_type(variable.type),
        description=description,
        codes=_encode_codes(variable.values),
        unit="",
        min="",
        max="",
        uri=uri,
    )
338+
339+
340+
def write_canonical_dd(dd: DataDictionary, output_path: Path) -> Path:
    """Write a parsed DataDictionary to a TSV file in the schema-automator canonical format.

    The file starts with a header row of ``DD_TSV_COLUMNS`` followed by one row per
    variable in ``dd.variables``. Missing parent directories are created and an
    existing file at ``output_path`` is overwritten.

    Args:
        dd: Parsed data dictionary whose variables are written.
        output_path: Destination of the TSV file.

    Returns:
        ``output_path``, unchanged, for convenient chaining.
    """
    # Tabs and line breaks inside a cell (free-text descriptions in particular) would be
    # read back as extra columns or rows, so they are flattened to spaces on the way out.
    flatten = str.maketrans({"\t": " ", "\n": " ", "\r": " "})

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        f.write("\t".join(DD_TSV_COLUMNS) + "\n")
        for var in dd.variables:
            row = _dd_row(var)
            f.write("\t".join(row[col].translate(flatten) for col in DD_TSV_COLUMNS) + "\n")
    logger.info("Wrote %s (%d variables)", output_path, len(dd.variables))
    return output_path
350+
351+
352+
def _dd_output_filename(dd: DataDictionary) -> str:
    """Return the output filename ``<phs>.<pht>.dd.tsv`` for a data dictionary.

    Only the part of each id before its first ``.`` is kept. A missing or empty
    study id becomes ``unknown_phs`` and a missing or empty table id becomes
    ``unknown_pht``.
    """
    if dd.study_id:
        phs, _, _ = dd.study_id.partition(".")
    else:
        phs = "unknown_phs"

    if dd.data_table_id:
        pht, _, _ = dd.data_table_id.partition(".")
    else:
        pht = "unknown_pht"

    return f"{phs}.{pht}.dd.tsv"
357+
358+
359+
def parse_cached_digests(
    cohort: Cohort,
    cache_root: Path = DEFAULT_CACHE_DIR,
    output_root: Path = Path("output"),
) -> list[Path]:
    """Convert all cached data_dict.xml files for a cohort into canonical-DD TSVs (offline).

    Every ``*.data_dict.xml`` file in the cohort's cache directory is parsed and
    written to ``<output_root>/<cohort.key>/dd/<phs>.<pht>.dd.tsv``, in sorted order
    of the cached filenames.

    Args:
        cohort: Cohort whose cached digests are converted.
        cache_root: Root of the digest cache; the cohort's own directory is resolved
            from it by ``_study_cache_path``.
        output_root: Root directory under which the TSV files are written.

    Returns:
        The distinct TSV paths that were written, in the order first produced.

    Raises:
        FileNotFoundError: If the cohort's cache directory does not exist.
    """
    cache_dir = _study_cache_path(cache_root, cohort)
    if not cache_dir.exists():
        raise FileNotFoundError(f"No cached digests at {cache_dir} (run fetch-digests first)")

    out_dir = output_root / cohort.key / "dd"
    written: list[Path] = []
    # Output path -> cached file that produced it. The output filename keeps only the
    # leading part of the study/table ids, so two cached files can map to the same TSV.
    sources: dict[Path, Path] = {}
    for dd_path in sorted(cache_dir.glob("*.data_dict.xml")):
        dd = parse_data_dict(dd_path)
        out_path = out_dir / _dd_output_filename(dd)
        earlier = sources.get(out_path)
        if earlier is None:
            written.append(out_path)
        else:
            # Still write (last one wins, as before), but say so and keep the returned
            # list free of duplicates so callers count files, not write operations.
            logger.warning(
                "%s and %s both map to %s; the later file overwrites the earlier output",
                earlier.name,
                dd_path.name,
                out_path,
            )
        sources[out_path] = dd_path
        write_canonical_dd(dd, out_path)
    return written

tests/unit/test_fetch_digests.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
from pathlib import Path
44

55
from dm_bip.prepare_study.fetch_digests import (
6+
DD_TSV_COLUMNS,
67
DigestValue,
78
parse_data_dict,
89
parse_var_report,
10+
write_canonical_dd,
911
)
1012

1113
FIXTURES = Path(__file__).parent.parent / "input" / "dbgap_digests"
@@ -50,3 +52,29 @@ def test_parse_var_report_jhs_subject():
5052
assert total_row.stats is not None
5153
assert total_row.stats.n == 5885
5254
assert total_row.stats.nulls == 0
55+
56+
57+
def test_write_canonical_dd_jhs_subject(tmp_path):
    """Write the JHS Subject data_dict as canonical TSV and check header, types and codes."""
    tsv_path = tmp_path / "jhs.subject.dd.tsv"
    write_canonical_dd(parse_data_dict(FIXTURES / "JHS_Subject.data_dict.xml"), tsv_path)

    header_line, *data_lines = tsv_path.read_text(encoding="utf-8").strip().split("\n")
    columns = header_line.split("\t")
    assert tuple(columns) == DD_TSV_COLUMNS

    # Index the data rows by variable name for direct lookup below.
    by_name = {}
    for line in data_lines:
        record = dict(zip(columns, line.split("\t"), strict=False))
        by_name[record["name"]] = record

    # A plain identifier column: string-typed, no codes, traceable to a phv accession.
    subject = by_name["SUBJECT_ID"]
    assert subject["type"] == "string"
    assert subject["description"] == "Subject ID"
    assert subject["codes"] == ""
    assert subject["uri"].startswith("dbgap:phv")

    # An encoded-value column: rendered as 'code, label | code, label'.
    consent = by_name["CONSENT"]
    assert consent["type"] == "permissible_values"
    assert consent["codes"].startswith("0, ")
    assert " | 1, " in consent["codes"]
    first_entry = consent["codes"].split("|")[0]
    first_label = first_entry.split(",", 1)[1]
    assert "," not in first_label  # comma-in-label sanitized to ;

0 commit comments

Comments
 (0)