diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..dc6cc59 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +*data diff --git a/.env b/.env new file mode 100644 index 0000000..8180890 --- /dev/null +++ b/.env @@ -0,0 +1,3 @@ +# set to 1 to expand abbreviations via ALLIE (http://allie.dbcls.jp) + +EXPAND_ABBREVIATIONS=1 diff --git a/README.md b/README.md index 90cc898..18e890e 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,17 @@ It is also possible to ingest the daily update files provided by MEDLINE (`ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/`). **BY DEFAULT, ALL UPDATE FILES WILL BE APPLIED IN THIS MODE** +## Abbreviation expansion +Abbreviation expansion is done via the ALLIE (http://allie.dbcls.jp) database. +By default, abbreviations are kept as-is from PubMed, but by changing the setting in `.env` +to + +``` +EXPAND_ABBREVIATIONS=1 +``` + +the ALLIE database will be downloaded and installed into a postgres table. As the PubMed abstracts are ingested, this database is queried and any abbreviations found within the abstract are replaced with the long form, and the result is stored within the `abstract_long_form` field. + ## Caveats - The intended use is for testing of query logic, and the JVM options set for Elasticsearch are set with this in mind. diff --git a/docker-compose.yml b/docker-compose.yml index a87e46b..9452e45 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -30,12 +30,12 @@ services: build: . 
networks: - kmnet -# command: "tail -f /dev/null" - command: "wait-for-it -s es01:9200 -s km_postgres:5432 -- python index_pubmed.py bulk --n_min 1 --n_max 5" + command: "wait-for-it -s es01:9200 -s km_postgres:5432 -- python index_pubmed.py bulk --n_min 1 --n_max 1" depends_on: - es01 environment: - PYTHONUNBUFFERED=1 + - EXPAND_ABBREVIATIONS=${EXPAND_ABBREVIATIONS} postgres: container_name: km_postgres diff --git a/index_pubmed.py b/index_pubmed.py index 9f6d5cd..6c84fa6 100644 --- a/index_pubmed.py +++ b/index_pubmed.py @@ -13,7 +13,7 @@ import urllib.request as urllib import pickle -DO_ABBREVIATIONS = True +EXPAND_ABBREVIATIONS = True if os.environ['EXPAND_ABBREVIATIONS'] == '1' else False es = Elasticsearch(['es01:9200']) @@ -280,7 +280,7 @@ def get_metadata_from_xml(self, filepath): temp["metadata_update"] = datetime.datetime.now() - if DO_ABBREVIATIONS: + if EXPAND_ABBREVIATIONS: print("Checking for abbreviations") self.cur.execute("SELECT DISTINCT(short_form, long_form), short_form, long_form FROM alice_abbreviations WHERE pubmed_id=%(pmid)s", {"pmid" : temp["PMID"]}) @@ -396,10 +396,9 @@ def main(): parser.add_argument('--n_min', default=1, type=int, help='Minimum file number to process.') parser.add_argument('--n_max', default=1, type=int, help='Maximum file number to process.') - - # TODO: pass + do in abbreviation embiggening - #if DO_ABBREVIATIONS: - #download_allie() + if EXPAND_ABBREVIATIONS: + print("Downloading ALLIE abbreviation expansion database...") + download_allie() if not es.indices.exists("pubmed_abstracts"): es.indices.create("pubmed_abstracts")