Skip to content

Commit 5ba8e54

Browse files
authored
Merge pull request #2 from iross/abbrevs
Abbreviation expansion
2 parents ad015f2 + a3469bd commit 5ba8e54

File tree

7 files changed

+135
-3
lines changed

7 files changed

+135
-3
lines changed

.dockerignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*data

.env

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# set to 1 to expand abbreviations via ALLIE (http://allie.dbcls.jp)
2+
3+
EXPAND_ABBREVIATIONS=1

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,17 @@ It is also possible to ingest the daily update files provided by MEDLINE
4040
(`ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/`). **BY DEFAULT, ALL UPDATE
4141
FILES WILL BE APPLIED IN THIS MODE**
4242

43+
## Abbreviation expansion
44+
Abbreviation expansion is done via the ALLIE (http://allie.dbcls.jp) database.
45+
By default, abbreviations are kept as-is from PubMed, but by changing the setting in `.env`
46+
to
47+
48+
```
49+
EXPAND_ABBREVIATIONS=1
50+
```
51+
52+
The ALLIE database will be downloaded and installed into a postgres table. As the PubMed abstracts are ingested, this database is queried and any abbreviations found within the abstract are replaced with the long form, and the result is stored within the `abstract_long_form` field.
53+
4354
## Caveats
4455
- The intended use is for testing of query logic, and the JVM options set for
4556
Elasticsearch are set with this in mind.

docker-compose.yml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,29 @@ services:
3030
build: .
3131
networks:
3232
- kmnet
33-
command: "/bin/bash ./wait_for_it.sh -t 0 es01:9200 -- python index_pubmed.py bulk --n_min 1 --n_max 1"
33+
command: "wait-for-it -s es01:9200 -s km_postgres:5432 -- python index_pubmed.py bulk --n_min 1 --n_max 1"
3434
depends_on:
3535
- es01
3636
environment:
3737
- PYTHONUNBUFFERED=1
38+
- EXPAND_ABBREVIATIONS=${EXPAND_ABBREVIATIONS}
39+
40+
postgres:
41+
container_name: km_postgres
42+
restart: always
43+
image: postgres:latest
44+
environment:
45+
- POSTGRES_PASSWORD=supersecretpassword
46+
- POSTGRES_USER=kinderminer
47+
volumes:
48+
- ./init.sql:/docker-entrypoint-initdb.d/init.sql
49+
- type: bind
50+
source: ./allie_data
51+
target: /var/lib/postgresql/data
52+
networks:
53+
- kmnet
54+
ports:
55+
- "5432:5432"
3856

3957
volumes:
4058
esdata01:

index_pubmed.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,13 @@
88
from xml.etree import ElementTree as ET
99
import re
1010
import ftplib
11+
import psycopg2
12+
import psycopg2.extras
13+
import urllib.request as urllib
1114
import pickle
1215

16+
EXPAND_ABBREVIATIONS = True if os.environ['EXPAND_ABBREVIATIONS'] == '1' else False
17+
1318
es = Elasticsearch(['es01:9200'])
1419

1520
def parse_cover_date(coverDate):
@@ -157,12 +162,16 @@ def update_mapping(index_name, type_name):
157162
return 0
158163

159164
class Helper():
165+
def __init__(self):
166+
self.conn = psycopg2.connect("dbname=%s user=%s password=%s host=%s port=%s" % \
167+
("kinderminer", "kinderminer", "supersecretpassword", "km_postgres", "5432"))
168+
self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
169+
160170
def get_metadata_from_xml(self, filepath):
161171
"""
162172
"""
163173
metadata = {}
164174

165-
166175
parser = ET.iterparse(filepath)
167176

168177
for event, element in parser:
@@ -271,6 +280,14 @@ def get_metadata_from_xml(self, filepath):
271280

272281
temp["metadata_update"] = datetime.datetime.now()
273282

283+
if EXPAND_ABBREVIATIONS:
284+
print("Checking for abbreviations")
285+
self.cur.execute("SELECT DISTINCT(short_form, long_form), short_form, long_form FROM alice_abbreviations WHERE pubmed_id=%(pmid)s",
286+
{"pmid" : temp["PMID"]})
287+
for abbr in self.cur:
288+
print("Abbreviation found!")
289+
temp["abstract_long_form"] = temp["abstract"].replace(abbr['short_form'], abbr['long_form'])
290+
274291
temp['time'] = [datetime.datetime.now()]
275292

276293
element.clear()
@@ -344,6 +361,33 @@ def update(self):
344361
updates_applied.add(update_file)
345362
pickle.dump(updates_applied, open("pubmed_updates_applied.p", "w"))
346363

364+
def download_allie():
365+
psql_fetching_conn = psycopg2.connect("dbname=%s user=%s password=%s host=%s port=%s" % \
366+
("kinderminer", "kinderminer", "supersecretpassword", "km_postgres", "5432"))
367+
cur = psql_fetching_conn.cursor()
368+
369+
update_file = 'alice_output_latest.txt.gz'
370+
print('ftp://ftp.dbcls.jp/allie/alice_output/%s' % update_file)
371+
urllib.urlretrieve('ftp://ftp.dbcls.jp/allie/alice_output/%s' % update_file, update_file)
372+
subprocess.call(["gunzip", '%s' % update_file])
373+
print("Cleaning up text")
374+
subprocess.call(["sed", "s/\\\\/\\\\\\\\/g", "-i", update_file.replace(".gz", "")])
375+
print("Copying into postgres")
376+
377+
# TODO: Need to make sure the table is there... but that can be done at the docker level
378+
379+
try:
380+
with open(update_file.replace(".gz", "")) as fin:
381+
cur.copy_from(fin, "alice_abbreviations")
382+
psql_fetching_conn.commit()
383+
#subprocess.call(["rm", update_file.replace(".gz", "")])
384+
except:
385+
print("Error copying %s" % update_file)
386+
print(sys.exc_info())
387+
psql_fetching_conn.commit()
388+
#subprocess.call(["rm", update_file.replace(".gz", "")])
389+
return 0
390+
347391
def main():
348392
parser = argparse.ArgumentParser(
349393
description="Utility for indexing PubMed abstracts into Elasticsearch to make them full-text searchable."
@@ -352,6 +396,10 @@ def main():
352396
parser.add_argument('--n_min', default=1, type=int, help='Minimum file number to process.')
353397
parser.add_argument('--n_max', default=1, type=int, help='Maximum file number to process.')
354398

399+
if EXPAND_ABBREVIATIONS:
400+
print("Downloading ALLIE abbreviation expansion database...")
401+
download_allie()
402+
355403
if not es.indices.exists("pubmed_abstracts"):
356404
es.indices.create("pubmed_abstracts")
357405
print("Waiting for ok status...")
@@ -367,6 +415,6 @@ def main():
367415
else:
368416
print("Invalid operation specified!")
369417
sys.exit(1)
370-
418+
#
371419
if __name__ == '__main__':
372420
main()

init.sql

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
--
2+
-- PostgreSQL database dump
3+
--
4+
5+
-- Dumped from database version 10.13
6+
-- Dumped by pg_dump version 10.13
7+
8+
SET statement_timeout = 0;
9+
SET lock_timeout = 0;
10+
SET idle_in_transaction_session_timeout = 0;
11+
SET client_encoding = 'UTF8';
12+
SET standard_conforming_strings = on;
13+
SELECT pg_catalog.set_config('search_path', '', false);
14+
SET check_function_bodies = false;
15+
SET xmloption = content;
16+
SET client_min_messages = warning;
17+
SET row_security = off;
18+
19+
SET default_tablespace = '';
20+
21+
SET default_with_oids = false;
22+
23+
--
24+
-- Name: alice_abbreviations; Type: TABLE; Schema: public; Owner: kinderminer
25+
--
26+
27+
CREATE TABLE public.alice_abbreviations (
28+
sequential_id integer,
29+
pubmed_id text,
30+
publication_year text,
31+
long_form_id integer,
32+
short_form_id integer,
33+
long_form text,
34+
short_form text
35+
);
36+
37+
38+
ALTER TABLE public.alice_abbreviations OWNER TO kinderminer;
39+
40+
--
41+
-- Name: alice_abbreviations_pubmed_id_idx; Type: INDEX; Schema: public; Owner: kinderminer
42+
--
43+
44+
CREATE INDEX alice_abbreviations_pubmed_id_idx ON public.alice_abbreviations USING btree (pubmed_id);
45+
46+
--
47+
-- PostgreSQL database dump complete
48+
--
49+

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
elasticsearch
22
python-dateutil
3+
psycopg2
4+
wait-for-it

0 commit comments

Comments
 (0)