Skip to content

Commit

Permalink
FHIR outputs
Browse files Browse the repository at this point in the history
Now produces FHIR outputs in addition to OWL and SemanticSQL.

General
- Update: Simplified CLI by importing & customizing OMOP2OWL-vocab CLI.
- Add: requirements.txt for versioned requirements.
- Update: Moved CLI to __main__ so that it is what you get when you run the package as a module.
- Update: Customized so that n3c_ingest.py now has prebaked configs
  • Loading branch information
joeflack4 committed Oct 3, 2023
1 parent 3665bf6 commit 2ff2500
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 102 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Run: `make all`

### Caveats
#### Memory requirements
Running with defaults takes somewhere between 28-50GB, and this only includes the "Subsumes" relationship type. There
Running with defaults takes somewhere between 28-50GB, and this only includes the "Is a" relationship type. There
are 411 total relationship types, so more memory is required as you add more.

### CLI
Expand Down
1 change: 1 addition & 0 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------
all: n3c.db

# n3c.owl: Running `python n3c_ingest.n3c_ingest` uses default params, whereas `python -m n3c_ingest` provides a CLI.
n3c.owl n3c.db: io/input/concept.csv | io/release/
python -m n3c_ingest --concept-csv-path io/input/concept.csv --concept-relationship-csv-path io/input/concept_relationship.csv --outdir io/release/

Expand Down
1 change: 0 additions & 1 deletion n3c_ingest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
"""N3C OMOP to OWL"""
from n3c_ingest.n3c_ingest import cli
5 changes: 3 additions & 2 deletions n3c_ingest/__main__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""N3C OMOP to OWL"""
from datetime import datetime

from n3c_ingest.n3c_ingest import cli
# noinspection PyProtectedMember
from omop2owl_vocab.omop2owl_vocab import cli


if __name__ == '__main__':
t1 = datetime.now()
cli()
cli('n3c-ingest', 'Convert N3C OMOP vocab tables to OWL, SemanticSQL, and FHIR.')
t2 = datetime.now()
print(f'Finished in {(t2 - t1).seconds} seconds')
117 changes: 19 additions & 98 deletions n3c_ingest/n3c_ingest.py
Original file line number Diff line number Diff line change
@@ -1,121 +1,42 @@
"""N3C ingest for TIMS."""
import os
from argparse import ArgumentParser
from datetime import datetime
from pathlib import Path

# noinspection PyProtectedMember
from omop2owl_vocab.omop2owl_vocab import _convert_semsql, _get_merged_file_outpath, _run_command, run
from omop2owl_vocab.omop2owl_vocab import omop2owl

SRC_DIR = Path(os.path.dirname(os.path.abspath(__file__)))
PROJECT_DIR = SRC_DIR.parent
IO_DIR = PROJECT_DIR / 'io'
RELEASE_DIR = PROJECT_DIR / IO_DIR / 'release'
INPUT_DIR = PROJECT_DIR / IO_DIR / 'input'
TERMHUB_CSETS_DIR = INPUT_DIR / 'termhub-csets'
# DATASETS_DIR = TERMHUB_CSETS_DIR / 'datasets' / 'prepped_files'
DATASETS_DIR = INPUT_DIR
CONCEPT_CSV = DATASETS_DIR / 'concept.csv'
CONCEPT_RELATIONSHIP_CSV = DATASETS_DIR / 'concept_relationship.csv'
PROG = 'n3c-ingest'
DESC = 'Convert N3C OMOP vocab tables to OWL, SemanticSQL, and FHIR.'
CONCEPT_CSV = INPUT_DIR / 'concept.csv'
CONCEPT_RELATIONSHIP_CSV = INPUT_DIR / 'concept_relationship.csv'

for d in [RELEASE_DIR, INPUT_DIR]:
os.makedirs(d, exist_ok=True)


# TODO: keep this in sync with omop2owl-vocab
# - how to: copy/paste it, and then replace the concept and concept rel parts to default to the paths above (assuming makefile doesn't do this)
def cli():
"""Command line interface."""
parser = ArgumentParser(prog=PROG, description=DESC)
# Required
parser.add_argument(
'-c', '--concept-csv-path', required=False, help='Path to CSV of OMOP concept table.')
parser.add_argument(
'-r', '--concept-relationship-csv-path', required=False,
help='Path to CSV of OMOP concept_relationship table.')
# Optional
parser.add_argument(
'-O', '--outdir', required=False, default=os.getcwd(), help='Output directory.')
# todo:would be good to allow them to pass their own pURL
parser.add_argument(
'-I', '--ontology-id', required=False, default='OMOP', # add str(randint(100000, 999999))?
help='Identifier for ontology. Used to generate a pURL and file name.')
parser.add_argument(
'-o', '--output-type', required=False, default='merged-post-split',
choices=['merged', 'split', 'merged-post-split', 'rxnorm'],
help='What output to generate? If "merged" will create an ONTOLOGY_ID.db file with all concepts of all vocabs '
'merged into one. If "split" will create an ONTOLOGY_ID-*.db file for each vocab. "merged-post-split" '
'output will be as if running both "split" and "merged", but the merging implementation is different. '
'Use this option if running out of memory. If using "rxnorm", will create a specifically customized '
'ONTOLOGY_ID-RxNorm.db.')
parser.add_argument(
'-v', '--vocabs', required=False, nargs='+',
help='Used with `--output-type specific-vocabs-merged`. Which vocabularies to include in the output? Usage: '
'--vocabs "Procedure Type" "Device Type"')
parser.add_argument(
'-R', '--relationships', required=False, nargs='+', default=['Is a'],
help='Which relationship types from the concept_relationship table\'s relationship_id field to include? '
'Default is "Is a" only. Passing "ALL" includes everything. Ignored for --output-type options that are '
'specific to a pre-set vocabulary (e.g. rxnorm). Usage: --realationships "Is a" "Maps to"')
parser.add_argument(
'-S', '--skip-semsql', required=False, action='store_true',
help='In addition to .owl, also convert to a SemanticSQL .db? This is always True except when --output-type is '
'all-merged-post-split and it is creating initial .owl files to be merged.')
parser.add_argument(
'-e', '--exclude-singletons', required=False, action='store_true',
help='Exclude terms that do not have any relationships. This only applies to --method robot.')
parser.add_argument(
'-s', '--semsql-only', required=False, action='store_true',
help='Use this if the .owl already exists and you just want to create a SemanticSQL .db.')
parser.add_argument(
'-C', '--use-cache', required=False, action='store_true',
help='Of outputs or intermediates already exist, use them.')
parser.add_argument(
'-M', '--memory', required=False, default=100, help='The amount of Java memory (GB) to allocate.')
parser.add_argument('-i', '--install', action='store_true', help='Installs necessary docker images.')
# TODO: Provide customizations for running with preset relationships (I think we want all vocabs)
# - got to look to our slack discussion
# - I won't be able to run the preset relationships. will run out of memory.
def skip_semsql_and_use_cache(
concept_csv_path: str = CONCEPT_CSV, concept_relationship_csv_path: str = CONCEPT_RELATIONSHIP_CSV,
outdir: str = RELEASE_DIR, use_cache: bool = True, skip_semsql: bool = True
):
"""Run ingest
# TODO: Need to switch to **kwargs for most of below
d = vars(parser.parse_args())
if d['install']:
_run_command('docker pull obolibrary/odkfull:dev')
print('Installation complete. Exiting.')
return
if not d['concept_csv_path'] or not d['concept_relationship_csv_path']:
raise RuntimeError('Must pass --concept-csv-path and --concept-relationship-csv-path')
if d['semsql_only']:
outpath: str = _get_merged_file_outpath(d['outdir'], d['ontology_id'], d['vocabs'])
_convert_semsql(outpath, memory=d['memory'])
elif d['output_type'] == 'split':
run(
concept_csv_path=d['concept_csv_path'], concept_relationship_csv_path=d['concept_relationship_csv_path'],
split_by_vocab=True, use_cache=d['use_cache'], skip_semsql=d['skip_semsql'],
exclude_singletons=d['exclude_singletons'], relationships=d['relationships'], vocabs=d['vocabs'],
memory=d['memory'], outdir=d['outdir'])
elif d['output_type'] == 'merged-post-split': # Default
run(
concept_csv_path=d['concept_csv_path'], concept_relationship_csv_path=d['concept_relationship_csv_path'],
split_by_vocab=True, split_by_vocab_merge_after=True, use_cache=d['use_cache'],
skip_semsql=d['skip_semsql'], exclude_singletons=d['exclude_singletons'], relationships=d['relationships'],
vocabs=d['vocabs'], memory=d['memory'], outdir=d['outdir'])
elif d['output_type'] == 'merged':
run(
concept_csv_path=d['concept_csv_path'], concept_relationship_csv_path=d['concept_relationship_csv_path'],
split_by_vocab=False, use_cache=d['use_cache'], skip_semsql=d['skip_semsql'], memory=d['memory'],
exclude_singletons=d['exclude_singletons'], relationships=d['relationships'], vocabs=d['vocabs'],
outdir=d['outdir'])
elif d['output_type'] == 'rxnorm':
# rxnorm_ingest(concept_csv_path=d['concept_csv_path'], concept_relationship_csv_path=d['concept_relationship_csv_path'])
run(
concept_csv_path=d['concept_csv_path'], concept_relationship_csv_path=d['concept_relationship_csv_path'],
split_by_vocab=True, vocabs=['RxNorm', 'ATC'], use_cache=d['use_cache'],
relationships=['Is a', 'Maps to', 'RxNorm inverse is a'], skip_semsql=d['skip_semsql'],
exclude_singletons=d['exclude_singletons'], memory=d['memory'], outdir=d['outdir'])
Running `python n3c_ingest.n3c_ingest` uses default params, whereas `python -m n3c_ingest` provides a CLI."""
omop2owl(
concept_csv_path, concept_relationship_csv_path, use_cache=use_cache, skip_semsql=skip_semsql, outdir=outdir,
# todo: may change omop2owl for these to be true by default
split_by_vocab=True, split_by_vocab_merge_after=True, retain_robot_templates=False
)


# todo: might want to make a wrapper CLI for this where I have different pre-baked configs to run
if __name__ == '__main__':
t1 = datetime.now()
cli()
skip_semsql_and_use_cache()
t2 = datetime.now()
print(f'Finished in {(t2 - t1).seconds} seconds')
123 changes: 123 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
airium==0.2.6
annotated-types==0.5.0
antlr4-python3-runtime==4.9.3
appdirs==1.4.4
attrs==23.1.0
Babel==2.12.1
bcp47==0.0.4
beautifulsoup4==4.12.2
cattrs==23.1.2
certifi==2023.7.22
chardet==5.2.0
charset-normalizer==3.3.0
class-resolver==0.4.2
click==8.1.7
colorama==0.4.6
curies==0.6.4
Deprecated==1.2.14
deprecation==2.1.0
distlib==0.3.7
EditorConfig==0.12.3
eutils==0.6.0
exceptiongroup==1.1.3
fastobo==0.12.2
filelock==3.12.4
funowl==0.2.3
ghp-import==2.1.0
greenlet==2.0.1
hbreader==0.9.1
idna==3.4
ijson==3.2.3
importlib-metadata==6.8.0
iniconfig==2.0.0
isodate==0.6.1
Jinja2==3.1.2
jsbeautifier==1.14.9
json-flattener==0.1.9
jsonasobj==1.3.1
jsonasobj2==1.0.4
jsonschema==4.19.1
jsonschema-specifications==2023.7.1
kgcl-rdflib==0.5.0
kgcl-schema==0.6.0
lark==1.1.7
linkml-renderer==0.3.0
linkml-runtime==1.6.0
lxml==4.9.3
Markdown==3.4.4
MarkupSafe==2.1.3
mergedeep==1.3.4
mkdocs==1.5.3
mkdocs-material==9.4.2
mkdocs-material-extensions==1.2
mkdocs-mermaid2-plugin==0.6.0
more-click==0.1.2
ndex2==3.5.1
networkx==3.1
numpy==1.26.0
oaklib==0.5.20
ols-client==0.1.4
omop2owl-vocab==1.1.0
ontoportal-client==0.0.4
packaging==23.2
paginate==0.5.6
pandas==2.1.1
pansql==0.0.1
pathspec==0.11.2
pbr==5.11.1
platformdirs==3.10.0
pluggy==1.3.0
prefixcommons==0.1.12
prefixmaps==0.1.5
pronto==2.5.5
pydantic==2.4.2
pydantic_core==2.10.1
Pygments==2.16.1
PyJSG==0.11.10
pymdown-extensions==10.3
pyparsing==3.1.1
pysolr==3.9.0
pystow==0.5.0
pytest==7.4.2
pytest-logging==2015.11.4
python-dateutil==2.8.2
PyTrie==0.4.0
pytz==2023.3.post1
PyYAML==6.0.1
pyyaml_env_tag==0.1
ratelimit==2.2.1
rdflib==7.0.0
rdflib-jsonld==0.6.1
rdflib-shim==1.0.3
referencing==0.30.2
regex==2023.8.8
requests==2.31.0
requests-cache==1.1.0
requests-toolbelt==1.0.0
rfc3987==1.3.8
rpds-py==0.10.3
scipy==1.11.3
semsimian==0.2.1
semsql==0.3.2
six==1.16.0
sortedcontainers==2.4.0
soupsieve==2.5
SPARQLWrapper==2.0.0
SQLAlchemy==2.0.21
SQLAlchemy-Utils==0.38.3
sssom==0.3.41
sssom-schema==0.15.0
stevedore==5.1.0
tomli==2.0.1
tqdm==4.66.1
typing_extensions==4.8.0
tzdata==2023.3
url-normalize==1.4.3
urllib3==2.0.5
validators==0.22.0
virtualenv==20.24.5
virtualenv-clone==0.5.7
virtualenvwrapper==4.8.4
watchdog==3.0.0
wrapt==1.15.0
zipp==3.17.0

0 comments on commit 2ff2500

Please sign in to comment.