-
-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathgenerate_cl_mesh_mappings.py
More file actions
79 lines (65 loc) · 2.42 KB
/
generate_cl_mesh_mappings.py
File metadata and controls
79 lines (65 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""Generate predicted CL to MeSH mappings by grounding CL term names with Gilda."""
import re
import gilda
import obonet
from bioregistry import NormalizedNamableReference
from curies.vocabulary import exact_match, lexical_matching_process
from indra.databases import mesh_client
from sssom_pydantic import MappingTool, SemanticMapping
from biomappings.resources import append_predictions
from biomappings.utils import get_script_url
# Location of the "basic" Cell Ontology (CL) file in OBO format.
CL_BASIC_OBO_URL = (
    "https://raw.githubusercontent.com/obophenotype/cell-ontology/master/cl-basic.obo"
)

# Parse the ontology at that URL with obonet.
g = obonet.read_obo(CL_BASIC_OBO_URL)

# Matches "MESH:" + one capital letter + digits + "." + digits/dots (e.g. MESH:A11.118).
mesh_tree_pattern = re.compile(r"MESH:[A-Z][0-9]+\.[0-9.]+")
# Matches "MESH:" + "C" or "D" + digits (e.g. MESH:D001773).
mesh_id_pattern = re.compile(r"MESH:[CD][0-9]+")
# Suffixes removed (longest first) when retrying a name that Gilda could not ground.
_CELL_SUFFIXES = (" cells", " cell")

# Maps a CL node identifier to the single MeSH identifier Gilda grounded its name to.
mappings = {}
for node, data in g.nodes(data=True):
    # Only Cell Ontology terms are of interest.
    if not node.startswith("CL:"):
        continue

    # A node without a name cannot be grounded; skip it instead of raising KeyError.
    name = data.get("name")
    if not name:
        continue

    # Skip terms whose definition, synonyms or xrefs already mention a MeSH reference.
    texts = [data.get("def", ""), *data.get("synonym", []), *data.get("xref", [])]
    if any(mesh_tree_pattern.search(text) or mesh_id_pattern.search(text) for text in texts):
        continue

    matches = gilda.ground(name)  # type:ignore[no-untyped-call]
    if not matches:
        # Retry without a trailing " cells"/" cell". Only the suffix is removed:
        # str.replace would also delete occurrences in the middle of the name.
        for suffix in _CELL_SUFFIXES:
            if name.endswith(suffix):
                matches = gilda.ground(name[: -len(suffix)])  # type:ignore[no-untyped-call]
                break
    if not matches:
        continue

    # Collect every MeSH identifier proposed by any of the matches.
    mesh_ids = set()
    for match in matches:
        mesh_ids |= {
            identifier
            for namespace, identifier in match.get_groundings()
            if namespace == "MESH"
        }

    if len(mesh_ids) > 1:
        # Ambiguous grounding: report it and do not record a mapping.
        print(f"Multiple MESH IDs for {node}")
    elif mesh_ids:
        mappings[node] = next(iter(mesh_ids))

print(f"Found {len(mappings)} CL->MESH mappings.")
# URL of this script; used below as the name of the mapping tool on every prediction.
provenance = get_script_url(__file__)

# Build one exact-match prediction, justified by lexical matching, per CL -> MeSH pair.
# NOTE(review): cl_id still includes its "CL:" prefix when passed as ``identifier``
# alongside prefix="cl" - confirm this is what the reference model expects.
predictions = [
    SemanticMapping(
        subject=NormalizedNamableReference(
            prefix="cl",
            identifier=cl_id,
            name=g.nodes[cl_id]["name"],
        ),
        predicate=exact_match,
        object=NormalizedNamableReference(
            prefix="mesh",
            identifier=mesh_id,
            name=mesh_client.get_mesh_name(mesh_id),
        ),
        justification=lexical_matching_process,
        confidence=0.9,
        mapping_tool=MappingTool(name=provenance),
    )
    for cl_id, mesh_id in mappings.items()
]

# Hand the collected predictions to biomappings for storage.
append_predictions(predictions)