Skip to content

Commit 7fc3562

Browse files
feat: refactor ancestry mapping to include distance from descendant node + implement functions to support curated list term mapping (#96)
1 parent dfe0b39 commit 7fc3562

21 files changed

+285
-72
lines changed

api/python/src/cellxgene_ontology_guide/ontology_parser.py

+125-11
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ def _parse_ontology_name(self, term_id: str) -> str:
4141

4242
def get_term_ancestors(self, term_id: str, include_self: bool = False) -> List[str]:
4343
"""
44-
Get the ancestor ontology terms for a given term. If include_self is True, the term itself will be included as an
45-
ancestor.
44+
Get the ancestor ontology terms for a given term. If include_self is True, the term itself will be included as
45+
an ancestor.
4646
4747
Example: get_term_ancestors("CL:0000005") -> ["CL:0000000", ...]
4848
@@ -51,13 +51,30 @@ def get_term_ancestors(self, term_id: str, include_self: bool = False) -> List[s
5151
:return: flattened List[str] of ancestor terms
5252
"""
5353
ontology_name = self._parse_ontology_name(term_id)
54-
ancestors: List[str] = self.cxg_schema.ontology(ontology_name)[term_id]["ancestors"]
54+
ancestors = list(self.cxg_schema.ontology(ontology_name)[term_id]["ancestors"].keys())
5555
return ancestors + [term_id] if include_self else ancestors
5656

57-
def get_term_list_ancestors(self, term_ids: str, include_self: bool = False) -> Dict[str, List[str]]:
57+
def get_term_ancestors_with_distances(self, term_id: str, include_self: bool = False) -> Dict[str, int]:
    """
    Get the ancestor ontology terms for a given term, and their distance from the term_id. If include_self is True,
    the term itself will be included as an ancestor, at distance 0.

    Example: get_term_ancestors_with_distances("CL:0000005") -> {"CL:0000000": 1, ...}

    :param term_id: str ontology term to find ancestors for
    :param include_self: boolean flag to include the term itself as an ancestor
    :return: Dict[str, int] map of ancestor terms and their respective distances from the term_id. The map is a
        new dict on every call, so callers may modify it without affecting the loaded ontology data.
    """
    ontology_name = self._parse_ontology_name(term_id)
    # Copy before editing: the stored "ancestors" map is shared ontology data. Writing the term into it would make
    # the term appear as its own ancestor (and as its own descendant) in every later lookup.
    ancestors: Dict[str, int] = dict(self.cxg_schema.ontology(ontology_name)[term_id]["ancestors"])
    if include_self:
        ancestors[term_id] = 0
    return ancestors
73+
74+
def get_term_list_ancestors(self, term_ids: List[str], include_self: bool = False) -> Dict[str, List[str]]:
75+
"""
76+
Get the ancestor ontology terms for each term in a list. If include_self is True, the term itself will be
77+
included as an ancestor.
6178
6279
Example: get_term_list_ancestors(["CL:0000003", "CL:0000005"], include_self=True) -> {
6380
"CL:0000003": ["CL:0000003"],
@@ -71,10 +88,106 @@ def get_term_list_ancestors(self, term_ids: str, include_self: bool = False) ->
7188
"""
7289
return {term_id: self.get_term_ancestors(term_id, include_self) for term_id in term_ids}
7390

91+
def map_high_level_terms(self, term_ids: List[str], high_level_terms: List[str]) -> Dict[str, List[str]]:
    """
    Given a list of ontology term IDs and a list of high_level_terms to map them to, returns a dictionary with
    format

    {"CL:0000003": ["CL:0000000", ...], "CL:0000005": ["CL:0000000", ...]}

    Where each term_id is mapped to a List[str] of the high-level terms that it is a descendant of. A term counts
    as a descendant of itself, so a term_id that is also a high-level term maps to itself.

    :param term_ids: list of str ontology terms to map high level terms for
    :param high_level_terms: list of str ontology terms that can be mapped to descendant term_ids
    :return: Dictionary mapping str term IDs to their respective List[str] of ancestor terms from the input list.
        Each key maps to empty list if there are no ancestors among the provided input.
    """
    # Build the lookup once so each membership test is a hash lookup rather than a scan of the whole list.
    allowed_terms = set(high_level_terms)
    ancestors = self.get_term_list_ancestors(term_ids, include_self=True)
    return {
        term_id: [ancestor for ancestor in ancestors[term_id] if ancestor in allowed_terms]
        for term_id in term_ids
    }
112+
113+
def get_distance_between_terms(self, term_id_1: str, term_id_2: str) -> int:
    """
    Get the distance between two ontology terms, measured as the number of edges from term_id_1 up to a lowest
    common ancestor plus the number of edges from term_id_2 up to that same ancestor.

    :param term_id_1: str ontology term to find distance for
    :param term_id_2: str ontology term to find distance for
    :return: int distance between the two terms, or -1 if no lowest common ancestor is found for them.
    """
    common = self.get_lowest_common_ancestors(term_id_1, term_id_2)
    if not common:
        return -1
    # The first reported lowest common ancestor is used as the meeting point of the two upward paths.
    meeting_point = common[0]
    edges_from_first = self.get_term_ancestors_with_distances(term_id_1, include_self=True)[meeting_point]
    edges_from_second = self.get_term_ancestors_with_distances(term_id_2, include_self=True)[meeting_point]
    return int(edges_from_first + edges_from_second)
129+
130+
def get_lowest_common_ancestors(self, term_id_1: str, term_id_2: str) -> List[str]:
    """
    Get the lowest common ancestors (LCAs) of two ontology terms. Each term is treated as one of its own
    ancestors, so an input term can itself be returned as an LCA.

    Terms must be from the same ontology. Ontologies are DAGs, so there may be multiple lowest common ancestors:
    every common ancestor whose summed distance to the two terms is minimal is returned.

    :param term_id_1: str ontology term to find LCA for
    :param term_id_2: str ontology term to find LCA for
    :return: List[str] of term IDs of the lowest common ancestors, sorted so the result is the same on every
        run. Empty list if the terms are from different ontologies or share no ancestor.
    """
    ontology = self._parse_ontology_name(term_id_1)
    if ontology != self._parse_ontology_name(term_id_2):
        return []
    # include_self=True puts each term in its own ancestor map, so the path up to the term itself is considered.
    ancestors_1 = self.get_term_ancestors_with_distances(term_id_1, include_self=True)
    ancestors_2 = self.get_term_ancestors_with_distances(term_id_2, include_self=True)
    summed_distances = {
        ancestor: ancestors_1[ancestor] + ancestors_2[ancestor]
        for ancestor in ancestors_1.keys() & ancestors_2.keys()
    }
    if not summed_distances:
        return []
    shortest = min(summed_distances.values())
    # Sort because set intersection order is arbitrary and callers index into this list.
    return sorted(ancestor for ancestor, total in summed_distances.items() if total == shortest)
156+
157+
def map_highest_level_term(self, term_ids: List[str], high_level_terms: List[str]) -> Dict[str, Union[str, None]]:
    """
    Given a list of ontology term IDs and a list of high_level_terms to map them to, returns a dictionary with
    format

    {"CL:0000003": "CL:0000000", "CL:0000005": "CL:0000000"}

    Where each term_id is mapped to the highest level term that it is a descendant of, from the list provided.
    "Highest level" means the matching high-level term with the greatest distance from the term_id. The term
    itself counts as a candidate.

    :param term_ids: list of str ontology terms to map high level terms for
    :param high_level_terms: list of str ontology terms that can be mapped to descendant term_ids
    :return: Dictionary mapping each str term ID to a single str high-level term, or to None if the term ID does
        not map to any of the provided high-level terms.
    """
    candidates_by_term = self.map_high_level_terms(term_ids, high_level_terms)
    highest_by_term = {}
    for term_id in term_ids:
        distances = self.get_term_ancestors_with_distances(term_id, include_self=True)
        candidates = candidates_by_term[term_id]
        if candidates:
            # the candidate farthest from term_id is the highest one; max() keeps the first on a tie
            highest_by_term[term_id] = max(candidates, key=lambda candidate: distances[candidate])
        else:
            highest_by_term[term_id] = None
    return highest_by_term
186+
74187
def get_terms_descendants(self, term_ids: List[str], include_self: bool = False) -> Dict[str, List[str]]:
75188
"""
76-
Get the descendant ontology terms for each term in a list. If include_self is True, the term itself will be included
77-
as a descendant.
189+
Get the descendant ontology terms for each term in a list. If include_self is True, the term itself will be
190+
included as a descendant.
78191
79192
Example: get_terms_descendants(["CL:0000003", "CL:0000005"], include_self=True) -> {
80193
"CL:0000003": ["CL:0000003", "CL:0000004", ...],
@@ -83,8 +196,8 @@ def get_terms_descendants(self, term_ids: List[str], include_self: bool = False)
83196
84197
:param term_ids: list of str ontology terms to find descendants for
85198
:param include_self: boolean flag to include the term itself as a descendant
86-
:return: Dictionary mapping str term IDs to their respective flattened List[str] of descendant terms. Maps to empty
87-
list if there are no descendants.
199+
:return: Dictionary mapping str term IDs to their respective flattened List[str] of descendant terms. Maps to
200+
empty list if there are no descendants.
88201
"""
89202
descendants_dict = dict()
90203
ontology_names = set()
@@ -96,7 +209,8 @@ def get_terms_descendants(self, term_ids: List[str], include_self: bool = False)
96209
for ontology in ontology_names:
97210
for candidate_descendant, candidate_metadata in self.cxg_schema.ontology(ontology).items():
98211
for ancestor_id in descendants_dict:
99-
if ancestor_id in candidate_metadata["ancestors"]:
212+
ancestors = candidate_metadata["ancestors"].keys()
213+
if ancestor_id in ancestors:
100214
descendants_dict[ancestor_id].append(candidate_descendant)
101215

102216
return descendants_dict

api/python/tests/test_ontology_parser.py

+104-12
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,35 @@
99
@pytest.fixture
1010
def ontology_dict():
1111
return {
12-
"CL:0000000": {"ancestors": [], "label": "cell A", "deprecated": False},
12+
"CL:0000000": {"ancestors": {}, "label": "cell A", "deprecated": False},
1313
"CL:0000001": {
14-
"ancestors": ["CL:0000000"],
14+
"ancestors": {"CL:0000000": 1},
1515
"label": "cell B",
1616
"deprecated": False,
1717
"consider": ["CL:0000004"],
1818
},
19-
"CL:0000002": {"ancestors": ["CL:0000000"], "label": "cell C", "deprecated": False},
19+
"CL:0000002": {"ancestors": {"CL:0000000": 1}, "label": "cell C", "deprecated": False},
2020
"CL:0000003": {
21-
"ancestors": ["CL:0000000"],
21+
"ancestors": {"CL:0000000": 1},
2222
"label": "obsolete cell",
2323
"deprecated": True,
2424
"replaced_by": "CL:0000004",
2525
"comments": ["this term was deprecated in favor of a descendant term of CL:0000001"],
2626
"term_tracker": "http://example.com/issue/1234",
2727
},
28-
"CL:0000004": {"ancestors": ["CL:0000001", "CL:0000000"], "label": "cell B2", "deprecated": False},
28+
"CL:0000004": {
29+
"ancestors": {"CL:0000000": 2, "CL:0000001": 1, "CL:0000002": 1},
30+
"label": "cell BC",
31+
"deprecated": False,
32+
},
33+
"CL:0000005": {
34+
"ancestors": {"CL:0000000": 2, "CL:0000001": 1, "CL:0000002": 1},
35+
"label": "cell BC2",
36+
"deprecated": False,
37+
},
38+
"CL:0000006": {"ancestors": {"CL:0000000": 2, "CL:0000001": 1}, "label": "cell B2", "deprecated": False},
39+
"CL:0000007": {"ancestors": {"CL:0000000": 2, "CL:0000001": 1}, "label": "cell B3", "deprecated": False},
40+
"CL:0000008": {"ancestors": {}, "label": "cell unrelated", "deprecated": False},
2941
}
3042

3143

@@ -62,32 +74,64 @@ def test_parse_ontology_name__not_supported(ontology_parser):
6274

6375

6476
def test_get_term_ancestors(ontology_parser):
65-
assert ontology_parser.get_term_ancestors("CL:0000004") == ["CL:0000001", "CL:0000000"]
77+
assert ontology_parser.get_term_ancestors("CL:0000004") == ["CL:0000000", "CL:0000001", "CL:0000002"]
6678
assert ontology_parser.get_term_ancestors("CL:0000004", include_self=True) == [
67-
"CL:0000001",
6879
"CL:0000000",
80+
"CL:0000001",
81+
"CL:0000002",
6982
"CL:0000004",
7083
]
7184

7285

86+
def test_get_term_ancestors_with_distances(ontology_parser):
    # ancestors only
    ancestors_only = {"CL:0000000": 2, "CL:0000001": 1, "CL:0000002": 1}
    assert ontology_parser.get_term_ancestors_with_distances("CL:0000004") == ancestors_only

    # include_self adds the queried term at distance 0
    ancestors_and_self = {"CL:0000000": 2, "CL:0000001": 1, "CL:0000002": 1, "CL:0000004": 0}
    assert ontology_parser.get_term_ancestors_with_distances("CL:0000004", include_self=True) == ancestors_and_self
98+
99+
73100
def test_get_term_list_ancestors(ontology_parser):
74101
assert ontology_parser.get_term_list_ancestors(["CL:0000000", "CL:0000004"]) == {
75102
"CL:0000000": [],
76-
"CL:0000004": ["CL:0000001", "CL:0000000"],
103+
"CL:0000004": ["CL:0000000", "CL:0000001", "CL:0000002"],
77104
}
78105
assert ontology_parser.get_term_list_ancestors(["CL:0000000", "CL:0000004"], include_self=True) == {
79106
"CL:0000000": ["CL:0000000"],
80-
"CL:0000004": ["CL:0000001", "CL:0000000", "CL:0000004"],
107+
"CL:0000004": ["CL:0000000", "CL:0000001", "CL:0000002", "CL:0000004"],
81108
}
82109

83110

84111
def test_get_terms_descendants(ontology_parser):
85112
assert ontology_parser.get_terms_descendants(["CL:0000000", "CL:0000004"]) == {
86-
"CL:0000000": ["CL:0000001", "CL:0000002", "CL:0000003", "CL:0000004"],
113+
"CL:0000000": [
114+
"CL:0000001",
115+
"CL:0000002",
116+
"CL:0000003",
117+
"CL:0000004",
118+
"CL:0000005",
119+
"CL:0000006",
120+
"CL:0000007",
121+
],
87122
"CL:0000004": [],
88123
}
89124
assert ontology_parser.get_terms_descendants(["CL:0000000", "CL:0000004"], include_self=True) == {
90-
"CL:0000000": ["CL:0000000", "CL:0000001", "CL:0000002", "CL:0000003", "CL:0000004"],
125+
"CL:0000000": [
126+
"CL:0000000",
127+
"CL:0000001",
128+
"CL:0000002",
129+
"CL:0000003",
130+
"CL:0000004",
131+
"CL:0000005",
132+
"CL:0000006",
133+
"CL:0000007",
134+
],
91135
"CL:0000004": ["CL:0000004"],
92136
}
93137

@@ -116,7 +160,55 @@ def test_get_term_metadata(ontology_parser):
116160

117161

118162
def test_get_term_label(ontology_parser):
119-
assert ontology_parser.get_term_label("CL:0000004") == "cell B2"
163+
assert ontology_parser.get_term_label("CL:0000004") == "cell BC"
164+
165+
166+
def test_map_high_level_terms(ontology_parser):
    # one term that is itself high-level, one with no match, one with two matching ancestors
    expected = {
        "CL:0000000": ["CL:0000000"],
        "CL:0000008": [],
        "CL:0000004": ["CL:0000000", "CL:0000001"],
    }
    actual = ontology_parser.map_high_level_terms(
        term_ids=["CL:0000000", "CL:0000008", "CL:0000004"],
        high_level_terms=["CL:0000000", "CL:0000001"],
    )
    assert actual == expected
171+
172+
173+
def test_map_highest_level_term(ontology_parser):
    # the unmatched term maps to None; the others map to the farthest matching high-level term
    expected = {
        "CL:0000000": "CL:0000000",
        "CL:0000008": None,
        "CL:0000004": "CL:0000000",
    }
    actual = ontology_parser.map_highest_level_term(
        term_ids=["CL:0000000", "CL:0000008", "CL:0000004"],
        high_level_terms=["CL:0000000", "CL:0000001"],
    )
    assert actual == expected
178+
179+
180+
def test_get_lowest_common_ancestors(ontology_parser):
    single_lca_cases = [
        ("CL:0000003", "CL:0000005", ["CL:0000000"]),  # root node LCA
        ("CL:0000006", "CL:0000007", ["CL:0000001"]),  # sibling LCA
        ("CL:0000002", "CL:0000005", ["CL:0000002"]),  # parent-child LCA
    ]
    for first, second, expected in single_lca_cases:
        assert ontology_parser.get_lowest_common_ancestors(term_id_1=first, term_id_2=second) == expected

    # multiple node: order of the returned list is not asserted
    shared = ontology_parser.get_lowest_common_ancestors(term_id_1="CL:0000004", term_id_2="CL:0000005")
    assert len(shared) == 2
    assert "CL:0000001" in shared
    assert "CL:0000002" in shared

    # disjoint
    assert ontology_parser.get_lowest_common_ancestors(term_id_1="CL:0000001", term_id_2="CL:0000008") == []
198+
199+
200+
def test_get_distance_between_terms(ontology_parser):
    distance_cases = [
        ("CL:0000003", "CL:0000005", 3),  # distance when root node is lca
        ("CL:0000002", "CL:0000005", 1),  # parent-child distance
        ("CL:0000004", "CL:0000005", 2),  # multiple LCAs distance
        ("CL:0000001", "CL:0000008", -1),  # disjoint distance
    ]
    for first, second, expected in distance_cases:
        assert ontology_parser.get_distance_between_terms(term_id_1=first, term_id_2=second) == expected
120212

121213

122214
def test_get_ontology_download_url(ontology_parser):

artifact-schemas/all_ontology_schema.json

+10-10
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
2-
"$schema": "http://json-schema.org/draft-07/schema#",
2+
"$schema": "http://json-schema.org/draft-07/schema#",
33
"title": "Valid Ontology Term JSON Schema",
44
"description": "Schema for file containing metadata for Ontology Terms accepted in dataset submissions to CZ CellXGene Data Portal.",
55
"type": "object",
66
"patternProperties": {
7-
"^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$": {
7+
"^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7,}$": {
88
"type": "object",
99
"properties": {
1010
"label": {
@@ -16,13 +16,13 @@
1616
"description": "Indicates whether the ontology entry is deprecated."
1717
},
1818
"ancestors": {
19-
"type": "array",
20-
"items": {
21-
"type": "string",
22-
"pattern": "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$",
23-
"description": "List of ancestor IDs for the ontology entry."
24-
},
25-
"description": "An array of ancestor ontology terms that this term is a subclass of."
19+
"type": "object",
20+
"description": "A map of ancestor ontology terms that this term is a subclass of, keyed to the distance from the term.",
21+
"patternProperties": {
22+
"^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7,}$": {
23+
"type": "integer"
24+
}
25+
}
2626
},
2727
"comments": {
2828
"type": "array",
@@ -47,7 +47,7 @@
4747
},
4848
"replaced_by": {
4949
"type": "string",
50-
"pattern": "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$",
50+
"pattern": "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7,}$",
5151
"description": "If deprecated, the ID of the ontology entry that should canonically replace this one."
5252
}
5353
},

artifact-schemas/cell_class_list_schema.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"type": "array",
66
"items": {
77
"type": "string",
8-
"pattern": "^CL:[0-9]{7}$"
8+
"pattern": "^CL:[0-9]{7,}$"
99
},
1010
"minItems": 1,
1111
"uniqueItems": true

0 commit comments

Comments
 (0)