Skip to content

Commit 3920e5d

Browse files
Merge pull request #551 from pardeep-singh/pardeep/issue317-cardiology-domain
feat: add cardiology domain to NER catalog and zero-shot label maps
2 parents f05b239 + 7aa8d75 commit 3920e5d

3 files changed

Lines changed: 150 additions & 44 deletions

File tree

openmed/core/model_registry.py

Lines changed: 74 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,16 @@ def size_mb(self) -> Optional[int]:
173173
"Protein": ["GENE_OR_GENE_PRODUCT", "PROTEIN"],
174174
"Pathology": ["DISEASE", "PATHOLOGY"],
175175
"Hematology": ["CANCER", "DISEASE"],
176+
# Forward-looking metadata for future Cardiology models; no Cardiology model is
177+
# registered today (see issue #317).
178+
"Cardiology": [
179+
"CARDIAC_FINDING",
180+
"ECG_FINDING",
181+
"EJECTION_FRACTION",
182+
"CARDIAC_PROCEDURE",
183+
"CARDIAC_DEVICE",
184+
"ANATOMY",
185+
],
176186
"Privacy": _PII_ENTITY_TYPES,
177187
}
178188

@@ -617,53 +627,74 @@ def get_all_models() -> Dict[str, ModelInfo]:
617627
return OPENMED_MODELS.copy()
618628

619629

630+
_CATEGORY_KEYWORDS: Dict[str, Tuple[str, str]] = {
631+
"pii|deidentif|hipaa|phi|protected health|patient name|ssn|medical record|privacy|anonymiz": (
632+
"Privacy",
633+
"Contains PII/de-identification terms",
634+
),
635+
"cancer|tumor|oncolog|malign|chemotherapy|radiation": (
636+
"Oncology",
637+
"Contains cancer/oncology terms",
638+
),
639+
"drug|medication|pharma|dose|mg|pill|tablet|cisplatin": (
640+
"Pharmaceutical",
641+
"Contains pharmaceutical terms",
642+
),
643+
"gene|dna|protein|mutation|chromosome": (
644+
"Genomics",
645+
"Contains genomic/genetic terms",
646+
),
647+
"ecg|ekg|ejection fraction|arrhythmia|stent|pacemaker|murmur|st elevation|echocardiogram|cardiac|cardiolog": (
648+
"Cardiology",
649+
"Contains cardiology terms",
650+
),
651+
"heart|lung|brain|liver|kidney|organ": (
652+
"Anatomy",
653+
"Contains anatomical terms",
654+
),
655+
"bacteria|virus|organism|species": (
656+
"Species",
657+
"Contains organism/species terms",
658+
),
659+
"disease|condition|disorder|syndrome": (
660+
"Disease",
661+
"Contains disease/condition terms",
662+
),
663+
"pathology|histology|biopsy": (
664+
"Pathology",
665+
"Contains pathological terms",
666+
),
667+
"blood|lymph|leukemia|lymphoma": (
668+
"Hematology",
669+
"Contains hematological terms",
670+
),
671+
}
672+
673+
674+
def _match_categories(text: str) -> List[Tuple[str, str]]:
675+
"""Return ``(category, reason)`` pairs whose keywords match ``text``.
676+
677+
This is the routing layer behind :func:`get_model_suggestions`. It reports
678+
a category whenever the text matches its keywords, independently of whether
679+
any model is registered for that category (e.g. ``Cardiology`` has keyword
680+
routing but no registered model yet).
681+
"""
682+
683+
text_lower = text.lower()
684+
return [
685+
(category, reason)
686+
for pattern, (category, reason) in _CATEGORY_KEYWORDS.items()
687+
if re.search(pattern, text_lower)
688+
]
689+
690+
620691
def get_model_suggestions(text: str) -> List[Tuple[str, ModelInfo, str]]:
621692
"""Suggest appropriate models based on text content."""
622-
text_lower = text.lower()
623693
suggestions: List[Tuple[str, ModelInfo, str]] = []
624-
keywords = {
625-
"pii|deidentif|hipaa|phi|protected health|patient name|ssn|medical record|privacy|anonymiz": (
626-
"Privacy",
627-
"Contains PII/de-identification terms",
628-
),
629-
"cancer|tumor|oncolog|malign|chemotherapy|radiation": (
630-
"Oncology",
631-
"Contains cancer/oncology terms",
632-
),
633-
"drug|medication|pharma|dose|mg|pill|tablet|cisplatin": (
634-
"Pharmaceutical",
635-
"Contains pharmaceutical terms",
636-
),
637-
"gene|dna|protein|mutation|chromosome": (
638-
"Genomics",
639-
"Contains genomic/genetic terms",
640-
),
641-
"heart|lung|brain|liver|kidney|organ": (
642-
"Anatomy",
643-
"Contains anatomical terms",
644-
),
645-
"bacteria|virus|organism|species": (
646-
"Species",
647-
"Contains organism/species terms",
648-
),
649-
"disease|condition|disorder|syndrome": (
650-
"Disease",
651-
"Contains disease/condition terms",
652-
),
653-
"pathology|histology|biopsy": (
654-
"Pathology",
655-
"Contains pathological terms",
656-
),
657-
"blood|lymph|leukemia|lymphoma": (
658-
"Hematology",
659-
"Contains hematological terms",
660-
),
661-
}
662694

663-
for pattern, (category, reason) in keywords.items():
664-
if re.search(pattern, text_lower):
665-
for key in CATEGORIES.get(category, [])[:3]:
666-
suggestions.append((key, OPENMED_MODELS[key], reason))
695+
for category, reason in _match_categories(text):
696+
for key in CATEGORIES.get(category, [])[:3]:
697+
suggestions.append((key, OPENMED_MODELS[key], reason))
667698

668699
if not suggestions:
669700
for key in SIZE_RECOMMENDATIONS.get("balanced", [])[:3]:

openmed/zero_shot/data/label_maps/defaults.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,6 @@
1212
"education": ["Course", "Topic", "Institution", "Degree"],
1313
"social": ["Person", "Handle", "Hashtag", "URL"],
1414
"public_health": ["Condition", "Intervention", "Outcome", "Population"],
15+
"cardiology": ["CardiacFinding", "ECGFinding", "EjectionFraction", "CardiacProcedure", "CardiacDevice", "Anatomy"],
1516
"generic": ["Person", "Organization", "Location", "Date"]
1617
}

tests/unit/ner/test_label_map_consistency.py

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,19 @@
77
from __future__ import annotations
88

99
import json
10+
import re
1011
from pathlib import Path
1112

1213
import pytest
1314

14-
from openmed.core.model_registry import OPENMED_MODELS
15+
from openmed.core.model_registry import (
16+
_CATEGORY_ENTITY_TYPES,
17+
OPENMED_MODELS,
18+
_match_categories,
19+
get_model_suggestions,
20+
)
1521
from openmed.core.pii_entity_merger import is_more_specific, normalize_label
22+
from openmed.ner.labels import available_domains, get_default_labels
1623

1724
# ---------------------------------------------------------------------------
1825
# Fixtures
@@ -59,6 +66,73 @@ def test_generic_domain_exists(self, label_maps):
5966
def test_generic_domain_has_labels(self, label_maps):
6067
assert len(label_maps["generic"]) >= 1
6168

69+
def test_labels_follow_consistent_style(self, label_maps):
70+
"""Every domain label is a single ASCII-letters-only token (no spaces, digits, or underscores)."""
71+
for domain, labels in label_maps.items():
72+
for label in labels:
73+
assert re.fullmatch(r"[A-Za-z]+", label), (
74+
f"Domain {domain!r} label {label!r} drifts from the "
75+
"letters-only display-label style"
76+
)
77+
78+
79+
# ---------------------------------------------------------------------------
80+
# Cardiology domain (issue #317)
81+
# ---------------------------------------------------------------------------
82+
83+
84+
class TestCardiologyDomain:
85+
EXPECTED_LABELS = [
86+
"CardiacFinding",
87+
"ECGFinding",
88+
"EjectionFraction",
89+
"CardiacProcedure",
90+
"CardiacDevice",
91+
"Anatomy",
92+
]
93+
94+
def test_cardiology_in_available_domains(self):
95+
assert "cardiology" in available_domains()
96+
97+
def test_get_default_labels_returns_cardiology_set(self):
98+
labels = get_default_labels("cardiology")
99+
assert labels # non-empty
100+
assert labels == self.EXPECTED_LABELS
101+
102+
def test_cardiology_labels_have_no_duplicates(self):
103+
labels = get_default_labels("cardiology")
104+
lowered = [label.lower() for label in labels]
105+
assert len(lowered) == len(set(lowered))
106+
107+
108+
# ---------------------------------------------------------------------------
109+
# Cardiology routing in model_registry (issue #317)
110+
# ---------------------------------------------------------------------------
111+
112+
113+
class TestCardiologyRouting:
114+
CARDIO_TEXT = "Echocardiogram shows reduced ejection fraction of 35%"
115+
116+
def test_match_categories_routes_cardiology(self):
117+
categories = [
118+
category for category, _reason in _match_categories(self.CARDIO_TEXT)
119+
]
120+
assert "Cardiology" in categories
121+
122+
def test_cardiology_is_registry_metadata_not_a_live_category(self):
123+
# Forward-looking metadata for future models; no Cardiology model exists today.
124+
assert "Cardiology" in _CATEGORY_ENTITY_TYPES
125+
from openmed.core.model_registry import CATEGORIES
126+
127+
assert "Cardiology" not in CATEGORIES
128+
129+
def test_get_model_suggestions_behavior_unchanged_for_cardiology(self):
130+
# With no Cardiology model registered, suggestions fall back to general
131+
# medical models; nothing tagged with the Cardiology category is returned.
132+
suggestions = get_model_suggestions(self.CARDIO_TEXT)
133+
assert suggestions # still returns something useful
134+
assert all(info.category != "Cardiology" for _key, info, _reason in suggestions)
135+
62136

63137
# ---------------------------------------------------------------------------
64138
# normalize_label idempotency

0 commit comments

Comments
 (0)