Skip to content

Commit

Permalink
Merge pull request #2 from iross/abbrevs
Browse files Browse the repository at this point in the history
Abbreviation expansion
  • Loading branch information
iross authored Jul 22, 2020
2 parents ad015f2 + a3469bd commit 5ba8e54
Show file tree
Hide file tree
Showing 7 changed files with 135 additions and 3 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*data
3 changes: 3 additions & 0 deletions .env
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# set to 1 to expand abbreviations via ALLIE (http://allie.dbcls.jp)

EXPAND_ABBREVIATIONS=1
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ It is also possible to ingest the daily update files provided by MEDLINE
(`ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/`). **BY DEFAULT, ALL UPDATE
FILES WILL BE APPLIED IN THIS MODE**

## Abbreviation expansion
Abbreviation expansion is done via the ALLIE (http://allie.dbcls.jp) database.
By default, abbreviations are kept as-is from PubMed. To enable expansion, change the setting
in `.env` to

```
EXPAND_ABBREVIATIONS=1
```

The ALLIE database will be downloaded and installed into a postgres table. As the PubMed abstracts are ingested, this database is queried and any abbreviations found within the abstract are replaced with the long form, and the result is stored within the `abstract_long_form` field.

## Caveats
- The intended use is for testing of query logic, and the JVM options set for
Elasticsearch are set with this in mind.
Expand Down
20 changes: 19 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,29 @@ services:
build: .
networks:
- kmnet
command: "/bin/bash ./wait_for_it.sh -t 0 es01:9200 -- python index_pubmed.py bulk --n_min 1 --n_max 1"
command: "wait-for-it -s es01:9200 -s km_postgres:5432 -- python index_pubmed.py bulk --n_min 1 --n_max 1"
depends_on:
- es01
environment:
- PYTHONUNBUFFERED=1
- EXPAND_ABBREVIATIONS=${EXPAND_ABBREVIATIONS}

postgres:
container_name: km_postgres
restart: always
image: postgres:latest
environment:
- POSTGRES_PASSWORD=supersecretpassword
- POSTGRES_USER=kinderminer
volumes:
- ./init.sql:/docker-entrypoint-initdb.d/init.sql
- type: bind
source: ./allie_data
target: /var/lib/postgresql/data
networks:
- kmnet
ports:
- "5432:5432"

volumes:
esdata01:
Expand Down
52 changes: 50 additions & 2 deletions index_pubmed.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,13 @@
from xml.etree import ElementTree as ET
import re
import ftplib
import psycopg2
import psycopg2.extras
import urllib.request as urllib
import pickle

# True when the EXPAND_ABBREVIATIONS environment variable (see .env /
# docker-compose.yml) is set to "1". Using .get() means an unset variable
# simply disables expansion instead of crashing at import with a KeyError.
EXPAND_ABBREVIATIONS = os.environ.get('EXPAND_ABBREVIATIONS') == '1'

es = Elasticsearch(['es01:9200'])

def parse_cover_date(coverDate):
Expand Down Expand Up @@ -157,12 +162,16 @@ def update_mapping(index_name, type_name):
return 0

class Helper():
def __init__(self):
    """Open a connection to the km_postgres service and a dict-style cursor.

    The cursor returns rows as dicts (RealDictCursor) so abbreviation
    lookups can be read by column name.
    """
    # Credentials match the postgres service defined in docker-compose.yml.
    dsn = ("dbname=kinderminer user=kinderminer "
           "password=supersecretpassword host=km_postgres port=5432")
    self.conn = psycopg2.connect(dsn)
    self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

def get_metadata_from_xml(self, filepath):
"""
"""
metadata = {}


parser = ET.iterparse(filepath)

for event, element in parser:
Expand Down Expand Up @@ -271,6 +280,14 @@ def get_metadata_from_xml(self, filepath):

temp["metadata_update"] = datetime.datetime.now()

if EXPAND_ABBREVIATIONS:
print("Checking for abbreviations")
self.cur.execute("SELECT DISTINCT(short_form, long_form), short_form, long_form FROM alice_abbreviations WHERE pubmed_id=%(pmid)s",
{"pmid" : temp["PMID"]})
for abbr in self.cur:
print("Abbreviation found!")
temp["abstract_long_form"] = temp["abstract"].replace(abbr['short_form'], abbr['long_form'])

temp['time'] = [datetime.datetime.now()]

element.clear()
Expand Down Expand Up @@ -344,6 +361,33 @@ def update(self):
updates_applied.add(update_file)
pickle.dump(updates_applied, open("pubmed_updates_applied.p", "w"))

def download_allie():
    """Download the latest ALLIE abbreviation dump and bulk-load it into postgres.

    Fetches the gzipped ALLIE dump over FTP, decompresses it, doubles
    backslashes (COPY treats a lone backslash as an escape character), and
    COPYs the rows into the alice_abbreviations table, which init.sql
    creates when the postgres container first starts.

    Returns:
        0 in all cases, matching the other top-level operations.
    """
    conn = psycopg2.connect("dbname=%s user=%s password=%s host=%s port=%s" % \
        ("kinderminer", "kinderminer", "supersecretpassword", "km_postgres", "5432"))
    cur = conn.cursor()

    update_file = 'alice_output_latest.txt.gz'
    data_file = update_file.replace(".gz", "")
    print('ftp://ftp.dbcls.jp/allie/alice_output/%s' % update_file)
    urllib.urlretrieve('ftp://ftp.dbcls.jp/allie/alice_output/%s' % update_file, update_file)
    subprocess.call(["gunzip", update_file])
    print("Cleaning up text")
    # Escape backslashes so COPY loads them literally instead of
    # interpreting them as escape sequences.
    subprocess.call(["sed", "s/\\\\/\\\\\\\\/g", "-i", data_file])
    print("Copying into postgres")

    try:
        with open(data_file) as fin:
            cur.copy_from(fin, "alice_abbreviations")
    except Exception:
        # On failure, report and roll back — never commit a partial load
        # (the original code committed in the error path too).
        print("Error copying %s" % update_file)
        print(sys.exc_info())
        conn.rollback()
    else:
        conn.commit()
    finally:
        # Close the one-shot connection instead of leaking it.
        cur.close()
        conn.close()
    return 0

def main():
parser = argparse.ArgumentParser(
description="Utility for indexing PubMed abstracts into Elasticsearch to make them full-text searchable."
Expand All @@ -352,6 +396,10 @@ def main():
parser.add_argument('--n_min', default=1, type=int, help='Minimum file number to process.')
parser.add_argument('--n_max', default=1, type=int, help='Maximum file number to process.')

if EXPAND_ABBREVIATIONS:
print("Downloading ALLIE abbreviation expansion database...")
download_allie()

if not es.indices.exists("pubmed_abstracts"):
es.indices.create("pubmed_abstracts")
print("Waiting for ok status...")
Expand All @@ -367,6 +415,6 @@ def main():
else:
print("Invalid operation specified!")
sys.exit(1)

#
# Script entry point: parse CLI arguments and run bulk indexing or updates.
if __name__ == '__main__':
    main()
49 changes: 49 additions & 0 deletions init.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
--
-- PostgreSQL database dump
--

-- Dumped from database version 10.13
-- Dumped by pg_dump version 10.13

-- Schema for the ALLIE abbreviation data that index_pubmed.py's
-- download_allie() bulk-loads via COPY. The "alice" naming follows the
-- ALLIE project's dump file (alice_output_latest.txt.gz).

SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SELECT pg_catalog.set_config('search_path', '', false);
SET check_function_bodies = false;
SET xmloption = content;
SET client_min_messages = warning;
SET row_security = off;

SET default_tablespace = '';

SET default_with_oids = false;

--
-- Name: alice_abbreviations; Type: TABLE; Schema: public; Owner: kinderminer
--

-- NOTE: column order must match the ALLIE dump, because copy_from() is
-- called without an explicit column list.
CREATE TABLE public.alice_abbreviations (
    sequential_id integer,
    pubmed_id text,
    publication_year text,
    long_form_id integer,
    short_form_id integer,
    long_form text,
    short_form text
);


ALTER TABLE public.alice_abbreviations OWNER TO kinderminer;

--
-- Name: alice_abbreviations_pubmed_id_idx; Type: INDEX; Schema: public; Owner: kinderminer
--

-- Index supporting the per-article lookup (WHERE pubmed_id = %(pmid)s)
-- performed during abstract ingestion.
CREATE INDEX alice_abbreviations_pubmed_id_idx ON public.alice_abbreviations USING btree (pubmed_id);

--
-- PostgreSQL database dump complete
--

2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
elasticsearch
python-dateutil
psycopg2
wait-for-it

0 comments on commit 5ba8e54

Please sign in to comment.