Add abbreviation expansion toggle

iross · Jul 22, 2020 · a3469bd · a3469bd
1 parent f8d42e8
commit a3469bd
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 8 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1 @@
+*data
diff --git a/.env b/.env
@@ -0,0 +1,3 @@
+# set to 1 to expand abbreviations via ALLIE (http://allie.dbcls.jp)
+
+EXPAND_ABBREVIATIONS=1
diff --git a/README.md b/README.md
@@ -40,6 +40,17 @@ It is also possible to ingest the daily update files provided by MEDLINE
 (`ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/`). **BY DEFAULT, ALL UPDATE
 FILES WILL BE APPLIED IN THIS MODE**
 
+## Abbreviation expansion
+Abbreviation expansion is done via the ALLIE (http://allie.dbcls.jp) database.
+By default, abbreviations are kept as-is from PubMed. To enable expansion, change the setting in `.env`
+to
+
+```
+EXPAND_ABBREVIATIONS=1
+```
+
+When expansion is enabled, the ALLIE database is downloaded and installed into a Postgres table. As the PubMed abstracts are ingested, this database is queried and any abbreviations found within the abstract are replaced with the long form, and the result is stored within the `abstract_long_form` field.
+
 ## Caveats
 - The intended use is for testing of query logic, and the JVM options set for
   Elasticsearch are set with this in mind.

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -30,12 +30,12 @@ services:
       build: .
       networks:
         - kmnet
-#      command: "tail -f /dev/null"
-      command: "wait-for-it -s es01:9200 -s km_postgres:5432 -- python index_pubmed.py bulk --n_min 1 --n_max 5"
+      command: "wait-for-it -s es01:9200 -s km_postgres:5432 -- python index_pubmed.py bulk --n_min 1 --n_max 1"
       depends_on:
         - es01
       environment:
         - PYTHONUNBUFFERED=1
+        - EXPAND_ABBREVIATIONS=${EXPAND_ABBREVIATIONS}
 
   postgres:
       container_name: km_postgres

diff --git a/index_pubmed.py b/index_pubmed.py
@@ -13,7 +13,7 @@
 import urllib.request as urllib
 import pickle
 
-DO_ABBREVIATIONS = True
+EXPAND_ABBREVIATIONS = True if os.environ['EXPAND_ABBREVIATIONS'] == '1' else False
 
 es = Elasticsearch(['es01:9200'])
 
@@ -280,7 +280,7 @@ def get_metadata_from_xml(self, filepath):
 
                 temp["metadata_update"] = datetime.datetime.now()
 
-                if DO_ABBREVIATIONS:
+                if EXPAND_ABBREVIATIONS:
                     print("Checking for abbreviations")
                     self.cur.execute("SELECT DISTINCT(short_form, long_form), short_form, long_form FROM alice_abbreviations WHERE pubmed_id=%(pmid)s",
                             {"pmid" : temp["PMID"]})
@@ -396,10 +396,9 @@ def main():
     parser.add_argument('--n_min', default=1, type=int, help='Minimum file number to process.')
     parser.add_argument('--n_max', default=1, type=int, help='Maximum file number to process.')
 
-
-    # TODO: pass + do in abbreviation embiggening
-    #if DO_ABBREVIATIONS:
-        #download_allie()
+    if EXPAND_ABBREVIATIONS:
+        print("Downloading ALLIE abbreviation expansion database...")
+        download_allie()
 
     if not es.indices.exists("pubmed_abstracts"):
         es.indices.create("pubmed_abstracts")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# set to 1 to expand abbreviations via ALLIE (http://allie.dbcls.jp)

		EXPAND_ABBREVIATIONS=1