
Commit 5368cc3

Merge pull request #474 from allenai/052_upgrade
Update to the latest UMLS version
2 parents: 4f9ba09 + d1aabb3

17 files changed: +189 -72 lines

.flake8

+3 -8

@@ -2,21 +2,16 @@
 max-line-length = 115

 ignore =
-    # these rules don't play well with black
-    E203  # whitespace before :
-    W503  # line break before binary operator
-    W504  # line break after binary operator
+    E203
+    W503
+    W504

 exclude =
     build/**
     docs/**

 per-file-ignores =
-    # __init__.py files are allowed to have unused imports and lines-too-long
     scispacy/__init__.py:F401
     scispacy/**/__init__.py:F401,E501

-    # scripts don't have to respect
-    # E501: line length
-    # E402: imports not at top of file (because we mess with sys.path)
     scripts/**:E501,E402

evaluation/evaluate_linker.py

+97 (new file)

@@ -0,0 +1,97 @@
+import spacy
+from scispacy.linking import EntityLinker
+from scispacy.data_util import read_full_med_mentions
+import os
+from tqdm import tqdm
+
+EVALUATION_FOLDER_PATH = os.path.dirname(os.path.abspath(__file__))
+
+
+def main():
+    nlp = spacy.load("en_core_sci_sm")
+    nlp.add_pipe(
+        "scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"}
+    )
+    linker = nlp.get_pipe("scispacy_linker")
+
+    med_mentions = read_full_med_mentions(
+        os.path.join(EVALUATION_FOLDER_PATH, os.pardir, "data", "med_mentions"),
+        use_umls_ids=True,
+    )
+
+    test_data = med_mentions[2]
+
+    total_entities = 0
+    correct_at_1 = 0
+    correct_at_2 = 0
+    correct_at_10 = 0
+    correct_at_40 = 0
+    correct_at_60 = 0
+    correct_at_80 = 0
+    correct_at_100 = 0
+    for text_doc, entities in tqdm(test_data):
+        for start, end, label in entities["entities"]:
+            text_span = text_doc[start:end]
+            candidates = linker.candidate_generator([text_span], 40)[0]
+            sorted_candidates = sorted(
+                candidates, reverse=True, key=lambda x: max(x.similarities)
+            )
+            candidate_ids = [c.concept_id for c in sorted_candidates]
+            if label in candidate_ids[:1]:
+                correct_at_1 += 1
+            if label in candidate_ids[:2]:
+                correct_at_2 += 1
+            if label in candidate_ids[:10]:
+                correct_at_10 += 1
+            if label in candidate_ids[:40]:
+                correct_at_40 += 1
+            # if label in candidate_ids[:60]:
+            #     correct_at_60 += 1
+            # if label in candidate_ids[:80]:
+            #     correct_at_80 += 1
+            # if label in candidate_ids[:100]:
+            #     correct_at_100 += 1
+
+            total_entities += 1
+
+    print("Total entities: ", total_entities)
+    print(
+        "Correct at 1: ", correct_at_1, "Recall at 1: ", correct_at_1 / total_entities
+    )
+    print(
+        "Correct at 2: ", correct_at_2, "Recall at 2: ", correct_at_2 / total_entities
+    )
+    print(
+        "Correct at 10: ",
+        correct_at_10,
+        "Recall at 10: ",
+        correct_at_10 / total_entities,
+    )
+    print(
+        "Correct at 40: ",
+        correct_at_40,
+        "Recall at 40: ",
+        correct_at_40 / total_entities,
+    )
+    # print(
+    #     "Correct at 60: ",
+    #     correct_at_60,
+    #     "Recall at 60: ",
+    #     correct_at_60 / total_entities,
+    # )
+    # print(
+    #     "Correct at 80: ",
+    #     correct_at_80,
+    #     "Recall at 80: ",
+    #     correct_at_80 / total_entities,
+    # )
+    # print(
+    #     "Correct at 100: ",
+    #     correct_at_100,
+    #     "Recall at 100: ",
+    #     correct_at_100 / total_entities,
+    # )
+
+
+if __name__ == "__main__":
+    main()
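
Note: the repeated correct_at_k counters and if-blocks above follow one pattern; a dict keyed by cutoff would collapse them. A minimal refactoring sketch, not part of the commit (update_recall_counters is a hypothetical helper):

from typing import Dict, List


def update_recall_counters(
    gold_id: str, candidate_ids: List[str], counters: Dict[int, int]
) -> None:
    # Increment counters[k] whenever the gold concept id appears in the top k.
    for k in counters:
        if gold_id in candidate_ids[:k]:
            counters[k] += 1


# Usage inside the entity loop, replacing the per-k if-blocks:
counters = {1: 0, 2: 0, 10: 0, 40: 0}
update_recall_counters("C0027051", ["C0027051", "C0011849"], counters)
print(counters)  # {1: 1, 2: 1, 10: 1, 40: 1}; recall at k is counters[k] / total_entities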

project.yml

+1 -1

@@ -2,7 +2,7 @@ title: "scispaCy pipeline"
 description: "All the steps needed in the scispaCy pipeline"

 vars:
-  version_string: "0.5.1"
+  version_string: "0.5.2"
   gpu_id: 0
   freqs_loc_s3: "s3://ai2-s2-scispacy/data/gorc_subset.freqs"
   freqs_loc_local: "assets/gorc_subset.freqs"

requirements.in

+2

@@ -18,6 +18,8 @@ flake8
 black
 mypy
 types-requests
+types-setuptools
+types-tabulate

 # Required for releases.
 twine

scispacy/abbreviation.py

+20 -2

@@ -82,6 +82,20 @@ def find_abbreviation(
     return short_form_candidate, long_form_candidate[starting_index:]


+def span_contains_unbalanced_parentheses(span: Span) -> bool:
+    stack_counter = 0
+    for token in span:
+        if token.text == "(":
+            stack_counter += 1
+        elif token.text == ")":
+            if stack_counter > 0:
+                stack_counter -= 1
+            else:
+                return True
+
+    return stack_counter != 0
+
+
 def filter_matches(
     matcher_output: List[Tuple[int, int, int]], doc: Doc
 ) -> List[Tuple[Span, Span]]:

@@ -100,6 +114,10 @@ def filter_matches(
             # Take one word before.
             short_form_candidate = doc[start - 2 : start - 1]
             long_form_candidate = doc[start:end]
+
+            # make sure any parentheses inside long form are balanced
+            if span_contains_unbalanced_parentheses(long_form_candidate):
+                continue
         else:
             # Normal case.
             # Short form is inside the parens.

@@ -190,7 +208,7 @@ def __call__(self, doc: Doc) -> Doc:
         filtered = filter_matches(matches_no_brackets, doc)
         occurences = self.find_matches_for(filtered, doc)

-        for (long_form, short_forms) in occurences:
+        for long_form, short_forms in occurences:
             for short in short_forms:
                 short._.long_form = long_form
                 doc._.abbreviations.append(short)

@@ -209,7 +227,7 @@ def find_matches_for(
         all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
         already_seen_long: Set[str] = set()
         already_seen_short: Set[str] = set()
-        for (long_candidate, short_candidate) in filtered:
+        for long_candidate, short_candidate in filtered:
             short, long = find_abbreviation(long_candidate, short_candidate)
             # We need the long and short form definitions to be unique, because we need
             # to store them so we can look them up later. This is a bit of a
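
For context, a quick illustration of the new balance check. This is a sketch assuming scispacy with this patch installed; spacy.blank("en") keeps it runnable without downloading a model:

import spacy
from scispacy.abbreviation import span_contains_unbalanced_parentheses

nlp = spacy.blank("en")
doc = nlp("magnetic resonance imaging ( MRI ) scan")

# Balanced span: the "(" and ")" pair up, so the candidate survives the filter.
print(span_contains_unbalanced_parentheses(doc[0:7]))  # False

# Unbalanced span: a ")" arrives with no matching "(", so the candidate is dropped.
print(span_contains_unbalanced_parentheses(doc[4:7]))  # True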

scispacy/candidate_generation.py

+27 -28

@@ -1,4 +1,4 @@
-from typing import List, Dict, Tuple, NamedTuple, Type
+from typing import Optional, List, Dict, Tuple, NamedTuple, Type
 import json
 import datetime
 from collections import defaultdict

@@ -41,38 +41,38 @@ class LinkerPaths(NamedTuple):


 UmlsLinkerPaths = LinkerPaths(
-    ann_index="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/nmslib_index.bin",  # noqa
-    tfidf_vectorizer="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/tfidf_vectorizer.joblib",  # noqa
-    tfidf_vectors="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/tfidf_vectors_sparse.npz",  # noqa
-    concept_aliases_list="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/concept_aliases.json",  # noqa
+    ann_index="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/nmslib_index.bin",  # noqa
+    tfidf_vectorizer="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectorizer.joblib",  # noqa
+    tfidf_vectors="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz",  # noqa
+    concept_aliases_list="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/concept_aliases.json",  # noqa
 )

 MeshLinkerPaths = LinkerPaths(
-    ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/mesh/nmslib_index.bin",  # noqa
-    tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/mesh/tfidf_vectorizer.joblib",  # noqa
-    tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/mesh/tfidf_vectors_sparse.npz",  # noqa
-    concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/mesh/concept_aliases.json",  # noqa
+    ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/mesh/nmslib_index.bin",  # noqa
+    tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/mesh/tfidf_vectorizer.joblib",  # noqa
+    tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/mesh/tfidf_vectors_sparse.npz",  # noqa
+    concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/mesh/concept_aliases.json",  # noqa
 )

 GeneOntologyLinkerPaths = LinkerPaths(
-    ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/go/nmslib_index.bin",  # noqa
-    tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/go/tfidf_vectorizer.joblib",  # noqa
-    tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/go/tfidf_vectors_sparse.npz",  # noqa
-    concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/go/concept_aliases.json",  # noqa
+    ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/go/nmslib_index.bin",  # noqa
+    tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/go/tfidf_vectorizer.joblib",  # noqa
+    tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/go/tfidf_vectors_sparse.npz",  # noqa
+    concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/go/concept_aliases.json",  # noqa
 )

 HumanPhenotypeOntologyLinkerPaths = LinkerPaths(
-    ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/hpo/nmslib_index.bin",  # noqa
-    tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/hpo/tfidf_vectorizer.joblib",  # noqa
-    tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/hpo/tfidf_vectors_sparse.npz",  # noqa
-    concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/hpo/concept_aliases.json",  # noqa
+    ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/hpo/nmslib_index.bin",  # noqa
+    tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/hpo/tfidf_vectorizer.joblib",  # noqa
+    tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/hpo/tfidf_vectors_sparse.npz",  # noqa
+    concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/hpo/concept_aliases.json",  # noqa
 )

 RxNormLinkerPaths = LinkerPaths(
-    ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/rxnorm/nmslib_index.bin",  # noqa
-    tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/rxnorm/tfidf_vectorizer.joblib",  # noqa
-    tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/rxnorm/tfidf_vectors_sparse.npz",  # noqa
-    concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2020-10-09/rxnorm/concept_aliases.json",  # noqa
+    ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/rxnorm/nmslib_index.bin",  # noqa
+    tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/rxnorm/tfidf_vectorizer.joblib",  # noqa
+    tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/rxnorm/tfidf_vectors_sparse.npz",  # noqa
+    concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/rxnorm/concept_aliases.json",  # noqa
 )

@@ -196,15 +196,14 @@ class CandidateGenerator:

     def __init__(
         self,
-        ann_index: FloatIndex = None,
-        tfidf_vectorizer: TfidfVectorizer = None,
-        ann_concept_aliases_list: List[str] = None,
-        kb: KnowledgeBase = None,
+        ann_index: Optional[FloatIndex] = None,
+        tfidf_vectorizer: Optional[TfidfVectorizer] = None,
+        ann_concept_aliases_list: Optional[List[str]] = None,
+        kb: Optional[KnowledgeBase] = None,
         verbose: bool = False,
         ef_search: int = 200,
-        name: str = None,
+        name: Optional[str] = None,
     ) -> None:
-
         if name is not None and any(
             [ann_index, tfidf_vectorizer, ann_concept_aliases_list, kb]
         ):

@@ -363,7 +362,7 @@ def __call__(


 def create_tfidf_ann_index(
-    out_path: str, kb: KnowledgeBase = None
+    out_path: str, kb: Optional[KnowledgeBase] = None
 ) -> Tuple[List[str], TfidfVectorizer, FloatIndex]:
     """
     Build tfidf vectorizer and ann index.
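
Two changes are bundled in this file: every linker artifact URL moves from the 2020-10-09 builds to the 2023-04-23 builds, and None defaults gain explicit Optional annotations, which mypy requires by default since 0.990 (no_implicit_optional). A minimal sketch of the typing fix in isolation; KnowledgeBase and create_index below are stand-ins for illustration, not scispacy's own:

from typing import Optional


class KnowledgeBase:  # hypothetical stand-in for scispacy's KnowledgeBase
    pass


# Before (flagged by mypy >= 0.990 under its default no_implicit_optional):
#     def create_index(kb: KnowledgeBase = None): ...
# After: the None default is spelled out, matching the change in this file.
def create_index(kb: Optional[KnowledgeBase] = None) -> KnowledgeBase:
    return kb if kb is not None else KnowledgeBase()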

scispacy/data_util.py

+9 -3

@@ -1,4 +1,4 @@
-from typing import NamedTuple, List, Iterator, Dict, Tuple
+from typing import Optional, NamedTuple, List, Iterator, Dict, Tuple
 import tarfile
 import atexit
 import os

@@ -148,9 +148,10 @@ def remove_overlapping_entities(

 def read_full_med_mentions(
     directory_path: str,
-    label_mapping: Dict[str, str] = None,
+    label_mapping: Optional[Dict[str, str]] = None,
     span_only: bool = False,
     spacy_format: bool = True,
+    use_umls_ids: bool = False,
 ):
     def _cleanup_dir(dir_path: str):
         if os.path.exists(dir_path):

@@ -209,7 +210,12 @@ def label_function(label):

     for example in examples:
         spacy_format_entities = [
-            (x.start, x.end, label_function(x.mention_type)) for x in example.entities
+            (
+                x.start,
+                x.end,
+                label_function(x.mention_type) if not use_umls_ids else x.umls_id,
+            )
+            for x in example.entities
         ]
         spacy_format_entities = remove_overlapping_entities(
             sorted(spacy_format_entities, key=lambda x: x[0])
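
The new use_umls_ids flag swaps the entity label from the MedMentions semantic-type string to the gold UMLS CUI, which is what the new evaluation script consumes. A usage sketch, assuming the MedMentions corpus is unpacked under data/med_mentions and that the function returns (train, dev, test) splits, as the med_mentions[2] indexing in evaluate_linker.py suggests:

from scispacy.data_util import read_full_med_mentions

train, dev, test = read_full_med_mentions("data/med_mentions", use_umls_ids=True)

text, annotations = test[0]
start, end, label = annotations["entities"][0]
# With use_umls_ids=True, label is a UMLS concept id (e.g. "C0027051")
# rather than a semantic-type label produced by label_function.
print(start, end, label)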

scispacy/file_cache.py

+7 -5

@@ -8,7 +8,7 @@
 import json
 from urllib.parse import urlparse
 from pathlib import Path
-from typing import Tuple, Union, IO
+from typing import Optional, Tuple, Union, IO
 from hashlib import sha256

 import requests

@@ -17,7 +17,9 @@
 DATASET_CACHE = str(CACHE_ROOT / "datasets")


-def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str:
+def cached_path(
+    url_or_filename: Union[str, Path], cache_dir: Optional[str] = None
+) -> str:
     """
     Given something that might be a URL (or might be a local path),
     determine which. If it's a URL, download the file and cache it, and

@@ -47,7 +49,7 @@ def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str
     )


-def url_to_filename(url: str, etag: str = None) -> str:
+def url_to_filename(url: str, etag: Optional[str] = None) -> str:
     """
     Convert `url` into a hashed filename in a repeatable way.
     If `etag` is specified, append its hash to the url's, delimited

@@ -68,7 +70,7 @@ def url_to_filename(url: str, etag: str = None) -> str:
     return filename


-def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]:
+def filename_to_url(filename: str, cache_dir: Optional[str] = None) -> Tuple[str, str]:
     """
     Return the url and etag (which may be ``None``) stored for `filename`.
     Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist.

@@ -99,7 +101,7 @@ def http_get(url: str, temp_file: IO) -> None:
             temp_file.write(chunk)


-def get_from_cache(url: str, cache_dir: str = None) -> str:
+def get_from_cache(url: str, cache_dir: Optional[str] = None) -> str:
     """
     Given a URL, look for the corresponding dataset in the local cache.
     If it's not there, download it. Then return the path to the cached file.
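
The same explicit-Optional cleanup as in candidate_generation.py, with no behavior change. For context, a usage sketch of cached_path against one of the 2023-04-23 artifact URLs introduced by this commit:

from scispacy.file_cache import cached_path

# First call downloads into scispacy's cache; later calls return the cached copy.
# Leaving cache_dir as None (the now-explicit Optional default) uses the default cache.
local_path = cached_path(
    "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/linkers/2023-04-23/umls/concept_aliases.json"
)
print(local_path)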

scispacy/hyponym_detector.py

-3

@@ -36,7 +36,6 @@ class HyponymDetector:
     def __init__(
         self, nlp: Language, name: str = "hyponym_detector", extended: bool = False
     ):
-
         self.nlp = nlp

         self.patterns = BASE_PATTERNS

@@ -91,7 +90,6 @@ def expand_to_noun_compound(self, token: Token, doc: Doc):
         return doc[start:end]

     def find_noun_compound_head(self, token: Token):
-
         while token.head.pos_ in {"PROPN", "NOUN", "PRON"} and token.dep_ == "compound":
             token = token.head
         return token

@@ -135,7 +133,6 @@ def __call__(self, doc: Doc):
                 )

         for token in hyponym.conjuncts:
-
             token_extended = self.expand_to_noun_compound(token, doc)
             if token != hypernym and token is not None:
                 doc._.hearst_patterns.append(
