Skip to content

Commit 2938275

Browse files
authored
Merge pull request #14 from thehyve/issue-11-case-insensitive-homonym-option
enable case insensitive homonym lookups
2 parents 225b499 + 4db02b0 commit 2938275

File tree

8 files changed

+101
-8
lines changed

8 files changed

+101
-8
lines changed

docs/kotobuki.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ update_usagi_file(
4040
vocab_schema="my_cdm",
4141
usagi_file=usagi_file,
4242
allow_homonyms=False,
43+
ignore_case=False,
4344
write_map_paths=False,
4445
inspect_only=False,
4546
overwrite=False,
@@ -86,9 +87,16 @@ All available homonyms will be parsed for concept relationships (just like the o
8687
non-standard concept) until a standard concept is found or all relationship paths have
8788
been traversed.
8889

90+
For some homonym concepts, the case may be different (e.g. diabetes type 2 vs Diabetes
91+
Type 2). When using kotobuki, you need to specify explicitly whether such homonyms
92+
should be included.
93+
8994
To include homonyms, add the `--allow-homonyms`/`-h` flag (CLI), or provide
9095
`allow_homonyms=True` (Python).
9196

97+
To search for homonyms regardless of case, add the `--ignore-case`/`-i`
98+
flag (CLI), or provide `ignore_case=True` (Python).
99+
92100
> ⚠️ **WARNING:**
93101
> Searching for standard concepts via homonyms is less reliable than via the concept
94102
> relationships, especially for concepts with a short name.

src/kotobuki/mapping_updater/cli.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,15 @@
3939
"try to find it through concepts with the same concept name. It is "
4040
"strongly recommended to index the concept_name column when using this.",
4141
)
42+
@click.option(
43+
"-i",
44+
"--ignore-case",
45+
is_flag=True,
46+
default=False,
47+
help="When searching for homonyms, do so in a case-insensitive way. "
48+
"It is strongly recommended to create a functional index on the "
49+
"concept table for this.",
50+
)
4251
@click.option(
4352
"-m",
4453
"--write-map-paths",
@@ -73,6 +82,7 @@ def _update_usagi_cli(
7382
schema: str,
7483
usagi_file: Path,
7584
allow_homonyms: bool,
85+
ignore_case: bool,
7686
write_map_paths: bool,
7787
inspect_only: bool,
7888
overwrite: bool,
@@ -87,6 +97,7 @@ def _update_usagi_cli(
8797
schema,
8898
usagi_file,
8999
allow_homonyms,
100+
ignore_case,
90101
write_map_paths,
91102
inspect_only,
92103
overwrite,

src/kotobuki/mapping_updater/db.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from collections.abc import Sequence
22

33
from omop_cdm.regular.cdm54 import Concept, ConceptRelationship
4-
from sqlalchemy import and_, select
4+
from sqlalchemy import and_, func, select
55
from sqlalchemy.orm import Session
66

77
from .relationship import (
@@ -65,9 +65,29 @@ def find_standard_concepts(
6565
return None
6666

6767

68-
def find_all_homonyms(concept_name: str, session: Session) -> Sequence[Concept]:
68+
def find_all_homonyms(
69+
concept: Concept, case_insensitive: bool, session: Session
70+
) -> Sequence[Concept]:
6971
"""Get Concept ORM objects that match the concept_name."""
70-
return session.scalars(select(Concept).where(Concept.concept_name == concept_name)).all()
72+
if case_insensitive:
73+
homonyms = session.scalars(
74+
select(Concept).where(
75+
and_(
76+
func.lower(Concept.concept_name) == concept.concept_name.lower(),
77+
Concept.concept_id != concept.concept_id,
78+
)
79+
)
80+
).all()
81+
else:
82+
homonyms = session.scalars(
83+
select(Concept).where(
84+
and_(
85+
Concept.concept_name == concept.concept_name,
86+
Concept.concept_id != concept.concept_id,
87+
)
88+
)
89+
).all()
90+
return homonyms
7191

7292

7393
def find_suitable_homonym(
@@ -84,11 +104,16 @@ def find_suitable_homonym(
84104
return None
85105

86106

87-
def find_new_mapping(concept: Concept, search_homonyms: bool, session: Session) -> NewMap | None:
107+
def find_new_mapping(
108+
concept: Concept,
109+
search_homonyms: bool,
110+
ignore_case: bool,
111+
session: Session,
112+
) -> NewMap | None:
88113
# Try to find standard concepts via concept relationships
89114
new_map = find_standard_concepts(concept.concept_id, session, [MapLink(concept)])
90115
# Alternatively via concepts with an identical name
91116
if search_homonyms and new_map is None:
92-
homonyms = find_all_homonyms(concept.concept_name, session)
117+
homonyms = find_all_homonyms(concept, ignore_case, session)
93118
new_map = find_suitable_homonym(homonyms, session, concept)
94119
return new_map

src/kotobuki/mapping_updater/update_usagi.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def update_usagi_file(
3030
vocab_schema: str,
3131
usagi_file: Path,
3232
allow_homonyms: bool = False,
33+
ignore_case: bool = False,
3334
write_map_paths: bool = False,
3435
inspect_only: bool = False,
3536
overwrite: bool = False,
@@ -87,7 +88,7 @@ def update_usagi_file(
8788

8889
logger.info("Querying database for standard concepts...")
8990
for concept in non_standard:
90-
new_map = find_new_mapping(concept, allow_homonyms, session)
91+
new_map = find_new_mapping(concept, allow_homonyms, ignore_case, session)
9192
new_mappings[concept.concept_id] = new_map
9293
log_remapped_concepts(new_mappings)
9394

tests/python/mapping_updater/db_setup/vocab_data/CONCEPT.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,5 @@ concept_id concept_name domain_id vocabulary_id concept_class_id standard_concep
1818
16 val2 0 0 0 S v2 2017-01-01 2099-12-31
1919
17 qwerty Condition 0 0 S qwerty 2017-01-01 2099-12-31
2020
18 azerty Condition 0 0 azerty 2017-01-01 2099-12-31
21+
19 QzErTy Condition 0 0 QzErTy 2017-01-01 2099-12-31
22+
20 qzerty Condition 0 0 S qzerty 2017-01-01 2099-12-31

tests/python/mapping_updater/db_setup/vocab_data/CONCEPT_RELATIONSHIP.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ concept_id_1 concept_id_2 relationship_id valid_start_date valid_end_date invali
1212
13 14 Maps to 1970-01-01 2099-12-31
1313
13 15 Maps to value 1970-01-01 2099-12-31
1414
13 16 Maps to value 1970-01-01 2099-12-31
15+
20 20 Maps to 2017-01-01 2099-12-31
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import pytest
2+
from sqlalchemy import Engine
3+
4+
from tests.python.mapping_updater.test_usagi_mappings import get_new_map
5+
6+
pytestmark = pytest.mark.usefixtures("create_vocab_tables")
7+
8+
9+
def test_find_standard_concept_via_case_insensitive_homonym(pg_db_engine: Engine):
10+
"""
11+
The concept QzErTy is non-standard without a relationship to a standard concept,
12+
however, there is the standard homonym "qzerty". Test if this homonym is picked up correctly.
13+
"""
14+
result = get_new_map(
15+
concept_id=19,
16+
engine=pg_db_engine,
17+
homonyms=True,
18+
ignore_case=True,
19+
)
20+
assert len(result.concepts) == 1
21+
assert result.concepts[0].concept_id == 20
22+
assert not result.value_as_concept
23+
24+
25+
def test_ignore_case_false(pg_db_engine: Engine):
26+
"""
27+
Test if "qzerty" is not picked up from "QzErTy" when ignore_case is False.
28+
"""
29+
result = get_new_map(
30+
concept_id=19,
31+
engine=pg_db_engine,
32+
homonyms=True,
33+
ignore_case=False,
34+
)
35+
assert result is None

tests/python/mapping_updater/test_usagi_mappings.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,20 @@ def get_concept_by_id(session: Session, concept_id: int) -> Concept | None:
1313
return session.scalars(select(Concept).filter(Concept.concept_id == concept_id)).one_or_none()
1414

1515

16-
def get_new_map(concept_id: int, engine: Engine, homonyms: bool = False) -> NewMap | None:
16+
def get_new_map(
17+
concept_id: int,
18+
engine: Engine,
19+
homonyms: bool = False,
20+
ignore_case: bool = False,
21+
) -> NewMap | None:
1722
with Session(engine, expire_on_commit=False) as session, session.begin():
1823
concept = get_concept_by_id(session, concept_id=concept_id)
19-
return find_new_mapping(concept=concept, search_homonyms=homonyms, session=session)
24+
return find_new_mapping(
25+
concept=concept,
26+
search_homonyms=homonyms,
27+
ignore_case=ignore_case,
28+
session=session,
29+
)
2030

2131

2232
def test_no_map(pg_db_engine: Engine):

0 commit comments

Comments
 (0)