# Source: dibbs-ecr-refiner/refiner/scripts/exports/export_groupers.py
# (CDCgov/dibbs-ecr-refiner on GitHub, commit aa463cb59340ae1fede0bb57eef3e8299b2f05ec)

import csv
import json
from datetime import datetime
from pathlib import Path

# Directory that contains this script; the CSV export is written here.
EXPORTS_DIR = Path(__file__).parents[0]
# Parent directory of the exports directory.
SCRIPTS_DIR = EXPORTS_DIR.parent
# Directory scanned for the TES JSON files.
TES_DATA_DIR = SCRIPTS_DIR.joinpath("data", "tes")
# Output path; the date stamp is computed once, when the module is imported.
OUTFILE = EXPORTS_DIR.joinpath(
    "tes-export-groupers-" + datetime.today().strftime("%Y-%m-%d") + ".csv"
)


def parse_snomed_from_url(url: str) -> str | None:
    """
    Extract the RSG SNOMED CT code from its canonical url.
    """

    if "rs-grouper-" in url:
        return url.split("rs-grouper-")[-1]
    return None


def _load_valuesets(data_dir: Path) -> dict[tuple, dict]:
    """
    Load every ValueSet from the JSON files in ``data_dir``.

    Each ``*.json`` file except ``manifest.json`` is read, and the entries of
    its ``valuesets`` list are stored keyed by ``(url, version)``. A file that
    cannot be read or parsed is reported and skipped.
    """

    all_valuesets: dict[tuple, dict] = {}
    # Sorted so that duplicate (url, version) keys resolve the same way, and
    # rows come out in the same order, regardless of filesystem listing order.
    json_files = sorted(
        f for f in data_dir.glob("*.json") if f.name != "manifest.json"
    )
    print(f"🔎 Found {len(json_files)} JSON file(s) in {data_dir}")
    for file_path in json_files:
        print(f"📖 Reading {file_path.name}...")
        try:
            # Explicit UTF-8: without it the decoding depends on the platform
            # locale and can fail or corrupt non-ASCII text.
            with open(file_path, encoding="utf-8") as f:
                doc = json.load(f)
            for vs in doc.get("valuesets") or []:
                all_valuesets[(vs.get("url"), vs.get("version"))] = vs
        except Exception as e:
            # Deliberately best-effort: one bad file must not abort the export.
            print(f"⚠️ Failed to read {file_path.name}: {e}")
    return all_valuesets


def _build_grouper_rows(all_valuesets: dict[tuple, dict]) -> tuple[list[dict], int]:
    """
    Build one CSV row per parent -> child ValueSet relationship.

    A parent is any ValueSet whose ``compose.include`` entries contain a
    ``valueSet`` list. Child references have the form ``url|version``;
    malformed or unresolvable references are reported and skipped, and
    children whose url has no "rs-grouper-" code are skipped silently.

    Returns the rows and the number of parents that referenced children.
    """

    rows: list[dict] = []
    parent_count = 0
    for parent in all_valuesets.values():
        # `or` fallbacks tolerate keys that are present but explicitly null.
        includes = (parent.get("compose") or {}).get("include") or []
        if not any("valueSet" in inc for inc in includes):
            continue
        parent_count += 1
        condition_grouper_name = parent.get("name") or parent.get("title")
        condition_grouper_canonical_url = parent.get("url")
        condition_grouper_version = parent.get("version")
        for include in includes:
            for child_ref in include.get("valueSet") or []:
                (
                    reporting_spec_grouper_canonical_url,
                    separator,
                    reporting_spec_grouper_version,
                ) = child_ref.partition("|")
                if not separator:
                    print(f"⚠️ Skipping malformed child reference: {child_ref}")
                    continue
                child_vs = all_valuesets.get(
                    (
                        reporting_spec_grouper_canonical_url,
                        reporting_spec_grouper_version,
                    )
                )
                if not child_vs:
                    print(
                        f"⚠️ Could not find child ValueSet: {reporting_spec_grouper_canonical_url}|{reporting_spec_grouper_version}"
                    )
                    continue
                reporting_spec_grouper_snomed = parse_snomed_from_url(
                    child_vs.get("url", "")
                )
                if not reporting_spec_grouper_snomed:
                    # Skip non-reporting-specification groupers (like additional context groupers)
                    continue
                rows.append(
                    {
                        "condition_grouper_name": condition_grouper_name,
                        "condition_grouper_canonical_url": condition_grouper_canonical_url,
                        "condition_grouper_version": condition_grouper_version,
                        "reporting_spec_grouper_snomed": reporting_spec_grouper_snomed,
                        "reporting_spec_grouper_name": child_vs.get("title"),
                        "reporting_spec_grouper_canonical_url": reporting_spec_grouper_canonical_url,
                        "reporting_spec_grouper_version": reporting_spec_grouper_version,
                    }
                )
    return rows, parent_count


def _write_rows(rows: list[dict], outfile: Path) -> None:
    """
    Write ``rows`` to ``outfile`` as UTF-8 CSV with a header line.

    The parent directory is created if needed. A failure while writing is
    reported on stdout rather than raised.
    """

    fieldnames = [
        "condition_grouper_name",
        "condition_grouper_canonical_url",
        "condition_grouper_version",
        "reporting_spec_grouper_snomed",
        "reporting_spec_grouper_name",
        "reporting_spec_grouper_canonical_url",
        "reporting_spec_grouper_version",
    ]
    outfile.parent.mkdir(parents=True, exist_ok=True)
    try:
        with open(outfile, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
        print(f"✅ Wrote {len(rows)} rows to {outfile}")
        print("🎉 Grouper CSV export complete!")
    except Exception as e:
        print(f"❌ Error writing CSV: {e}")


def main():
    """
    Run the script to export CSV file to show relationship between CGs <-> RSGs.

    Reads the ValueSets under TES_DATA_DIR, pairs each parent ValueSet with
    the child ValueSets it references, and writes the pairs to OUTFILE.
    """

    print("🌱 Starting grouper CSV export...")

    # load all ValueSets, keyed by (url, version)
    all_valuesets = _load_valuesets(TES_DATA_DIR)

    # collect one row per parent -> child relationship
    rows, parent_count = _build_grouper_rows(all_valuesets)
    print(
        f"🧩 Processed {parent_count} parent groupers with {len(rows)} parent-child relationships."
    )

    # write to csv
    _write_rows(rows, OUTFILE)


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()