FHIR outputs

Now produces FHIR outputs in addition to OWL and SemanticSQL. General - Update: Simplified CLI by importing & customizing OMOP2OWL-vocab CLI. - Add: requirements.txt for versioned requirements. - Update: Moved CLI to __main__ so that it is what you get when you run the package as a module. - Update: Customized so that n3c_ingest.py now has prebaked configs
timsbiomed · Oct 3, 2023 · 2ff2500 · 2ff2500
1 parent 3665bf6
commit 2ff2500
Show file tree

Hide file tree

Showing 6 changed files with 147 additions and 102 deletions.
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ Run: `make all`
 
 ### Caveats
 #### Memory requirements
-Running with defaults takes somewhere between 28-50GB, and this only includes the "Subsumes" relationship type. There 
+Running with defaults takes somewhere between 28-50GB, and this only includes the "Is a" relationship type. There 
 are 411 total relationship types, so memory requirements grow as you include more of them.
 
 ### CLI

diff --git a/makefile b/makefile
@@ -5,6 +5,7 @@
 # MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------
 all: n3c.db
 
+# n3c.owl: Running `python -m n3c_ingest.n3c_ingest` uses default params, whereas `python -m n3c_ingest` provides a CLI.
 n3c.owl n3c.db: io/input/concept.csv | io/release/
 	 python -m n3c_ingest --concept-csv-path io/input/concept.csv --concept-relationship-csv-path io/input/concept_relationship.csv --outdir io/release/
 

diff --git a/n3c_ingest/__init__.py b/n3c_ingest/__init__.py
@@ -1,2 +1 @@
 """N3C OMOP to OWL"""
-from n3c_ingest.n3c_ingest import cli
diff --git a/n3c_ingest/__main__.py b/n3c_ingest/__main__.py
@@ -1,11 +1,12 @@
 """N3C OMOP to OWL"""
 from datetime import datetime
 
-from n3c_ingest.n3c_ingest import cli
+# noinspection PyProtectedMember
+from omop2owl_vocab.omop2owl_vocab import cli
 
 
 if __name__ == '__main__':
     t1 = datetime.now()
-    cli()
+    cli('n3c-ingest', 'Convert N3C OMOP vocab tables to OWL, SemanticSQL, and FHIR.')
     t2 = datetime.now()
     print(f'Finished in {(t2 - t1).seconds} seconds')
diff --git a/n3c_ingest/n3c_ingest.py b/n3c_ingest/n3c_ingest.py
@@ -1,121 +1,42 @@
 """N3C ingest for TIMS."""
 import os
-from argparse import ArgumentParser
 from datetime import datetime
 from pathlib import Path
 
-# noinspection PyProtectedMember
-from omop2owl_vocab.omop2owl_vocab import _convert_semsql, _get_merged_file_outpath, _run_command, run
+from omop2owl_vocab.omop2owl_vocab import omop2owl
 
 SRC_DIR = Path(os.path.dirname(os.path.abspath(__file__)))
 PROJECT_DIR = SRC_DIR.parent
 IO_DIR = PROJECT_DIR / 'io'
 RELEASE_DIR = PROJECT_DIR / IO_DIR / 'release'
 INPUT_DIR = PROJECT_DIR / IO_DIR / 'input'
-TERMHUB_CSETS_DIR = INPUT_DIR / 'termhub-csets'
-# DATASETS_DIR = TERMHUB_CSETS_DIR / 'datasets' / 'prepped_files'
-DATASETS_DIR = INPUT_DIR
-CONCEPT_CSV = DATASETS_DIR / 'concept.csv'
-CONCEPT_RELATIONSHIP_CSV = DATASETS_DIR / 'concept_relationship.csv'
-PROG = 'n3c-ingest'
-DESC = 'Convert N3C OMOP vocab tables to OWL, SemanticSQL, and FHIR.'
+CONCEPT_CSV = INPUT_DIR / 'concept.csv'
+CONCEPT_RELATIONSHIP_CSV = INPUT_DIR / 'concept_relationship.csv'
 
 for d in [RELEASE_DIR, INPUT_DIR]:
     os.makedirs(d, exist_ok=True)
 
 
-# TODO: keep this in sync with omop2owl-vocab
-#  - how to: copy/paste it, and then replace the concept and concept rel parts to default to the paths above (assuming makefile doesn't do this)
-def cli():
-    """Command line interface."""
-    parser = ArgumentParser(prog=PROG, description=DESC)
-    # Required
-    parser.add_argument(
-        '-c', '--concept-csv-path', required=False, help='Path to CSV of OMOP concept table.')
-    parser.add_argument(
-        '-r', '--concept-relationship-csv-path', required=False,
-        help='Path to CSV of OMOP concept_relationship table.')
-    # Optional
-    parser.add_argument(
-        '-O', '--outdir', required=False, default=os.getcwd(), help='Output directory.')
-    # todo:would be good to allow them to pass their own pURL
-    parser.add_argument(
-        '-I', '--ontology-id', required=False, default='OMOP',  # add str(randint(100000, 999999))?
-        help='Identifier for ontology. Used to generate a pURL and file name.')
-    parser.add_argument(
-        '-o', '--output-type', required=False, default='merged-post-split',
-        choices=['merged', 'split', 'merged-post-split', 'rxnorm'],
-        help='What output to generate? If "merged" will create an ONTOLOGY_ID.db file with all concepts of all vocabs '
-             'merged into one. If "split" will create an ONTOLOGY_ID-*.db file for each vocab. "merged-post-split" '
-             'output will be as if running both "split" and  "merged", but the merging implementation is different. '
-             'Use this option if running out of memory. If using "rxnorm", will create a specifically customized '
-             'ONTOLOGY_ID-RxNorm.db.')
-    parser.add_argument(
-        '-v', '--vocabs', required=False, nargs='+',
-        help='Used with `--output-type specific-vocabs-merged`. Which vocabularies to include in the output?  Usage: '
-             '--vocabs "Procedure Type" "Device Type"')
-    parser.add_argument(
-        '-R', '--relationships', required=False, nargs='+', default=['Is a'],
-        help='Which relationship types from the concept_relationship table\'s relationship_id field to include? '
-             'Default is "Is a" only. Passing "ALL" includes everything. Ignored for --output-type options that are '
-             'specific to a pre-set vocabulary (e.g. rxnorm). Usage: --realationships "Is a" "Maps to"')
-    parser.add_argument(
-        '-S', '--skip-semsql', required=False, action='store_true',
-        help='In addition to .owl, also convert to a SemanticSQL .db? This is always True except when --output-type is '
-             'all-merged-post-split and it is creating initial .owl files to be merged.')
-    parser.add_argument(
-        '-e', '--exclude-singletons', required=False, action='store_true',
-        help='Exclude terms that do not have any relationships. This only applies to --method robot.')
-    parser.add_argument(
-        '-s', '--semsql-only', required=False, action='store_true',
-        help='Use this if the .owl already exists and you just want to create a SemanticSQL .db.')
-    parser.add_argument(
-        '-C', '--use-cache', required=False, action='store_true',
-        help='Of outputs or intermediates already exist, use them.')
-    parser.add_argument(
-        '-M', '--memory', required=False, default=100, help='The amount of Java memory (GB) to allocate.')
-    parser.add_argument('-i', '--install', action='store_true', help='Installs necessary docker images.')
+# TODO: Provide customizations for running with preset relationships (I think we want all vocabs)
+#  - need to look at our Slack discussion
+#  - I won't be able to run the preset relationships; it will run out of memory.
+def skip_semsql_and_use_cache(
+    concept_csv_path: str = CONCEPT_CSV, concept_relationship_csv_path: str = CONCEPT_RELATIONSHIP_CSV,
+    outdir: str = RELEASE_DIR, use_cache: bool = True, skip_semsql: bool = True
+):
+    """Run ingest
 
-    # TODO: Need to switch to **kwargs for most of below
-    d = vars(parser.parse_args())
-    if d['install']:
-        _run_command('docker pull obolibrary/odkfull:dev')
-        print('Installation complete. Exiting.')
-        return
-    if not d['concept_csv_path'] or not d['concept_relationship_csv_path']:
-        raise RuntimeError('Must pass --concept-csv-path and --concept-relationship-csv-path')
-    if d['semsql_only']:
-        outpath: str = _get_merged_file_outpath(d['outdir'], d['ontology_id'], d['vocabs'])
-        _convert_semsql(outpath, memory=d['memory'])
-    elif d['output_type'] == 'split':
-        run(
-            concept_csv_path=d['concept_csv_path'], concept_relationship_csv_path=d['concept_relationship_csv_path'],
-            split_by_vocab=True, use_cache=d['use_cache'], skip_semsql=d['skip_semsql'],
-            exclude_singletons=d['exclude_singletons'], relationships=d['relationships'], vocabs=d['vocabs'],
-            memory=d['memory'], outdir=d['outdir'])
-    elif d['output_type'] == 'merged-post-split':  # Default
-        run(
-            concept_csv_path=d['concept_csv_path'], concept_relationship_csv_path=d['concept_relationship_csv_path'],
-            split_by_vocab=True, split_by_vocab_merge_after=True, use_cache=d['use_cache'],
-            skip_semsql=d['skip_semsql'], exclude_singletons=d['exclude_singletons'], relationships=d['relationships'],
-            vocabs=d['vocabs'], memory=d['memory'], outdir=d['outdir'])
-    elif d['output_type'] == 'merged':
-        run(
-            concept_csv_path=d['concept_csv_path'], concept_relationship_csv_path=d['concept_relationship_csv_path'],
-            split_by_vocab=False, use_cache=d['use_cache'], skip_semsql=d['skip_semsql'], memory=d['memory'],
-            exclude_singletons=d['exclude_singletons'], relationships=d['relationships'], vocabs=d['vocabs'],
-            outdir=d['outdir'])
-    elif d['output_type'] == 'rxnorm':
-        # rxnorm_ingest(concept_csv_path=d['concept_csv_path'], concept_relationship_csv_path=d['concept_relationship_csv_path'])
-        run(
-            concept_csv_path=d['concept_csv_path'], concept_relationship_csv_path=d['concept_relationship_csv_path'],
-            split_by_vocab=True, vocabs=['RxNorm', 'ATC'], use_cache=d['use_cache'],
-            relationships=['Is a', 'Maps to', 'RxNorm inverse is a'], skip_semsql=d['skip_semsql'],
-            exclude_singletons=d['exclude_singletons'], memory=d['memory'], outdir=d['outdir'])
+    Running `python -m n3c_ingest.n3c_ingest` uses default params, whereas `python -m n3c_ingest` provides a CLI."""
+    omop2owl(
+        concept_csv_path, concept_relationship_csv_path, use_cache=use_cache, skip_semsql=skip_semsql, outdir=outdir,
+        # todo: may change omop2owl for these to be true by default
+        split_by_vocab=True, split_by_vocab_merge_after=True, retain_robot_templates=False
+    )
 
 
+# todo: might want to make a wrapper CLI for this where I have different pre-baked configs to run
 if __name__ == '__main__':
     t1 = datetime.now()
-    cli()
+    skip_semsql_and_use_cache()
     t2 = datetime.now()
     print(f'Finished in {(t2 - t1).seconds} seconds')
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,123 @@
+airium==0.2.6
+annotated-types==0.5.0
+antlr4-python3-runtime==4.9.3
+appdirs==1.4.4
+attrs==23.1.0
+Babel==2.12.1
+bcp47==0.0.4
+beautifulsoup4==4.12.2
+cattrs==23.1.2
+certifi==2023.7.22
+chardet==5.2.0
+charset-normalizer==3.3.0
+class-resolver==0.4.2
+click==8.1.7
+colorama==0.4.6
+curies==0.6.4
+Deprecated==1.2.14
+deprecation==2.1.0
+distlib==0.3.7
+EditorConfig==0.12.3
+eutils==0.6.0
+exceptiongroup==1.1.3
+fastobo==0.12.2
+filelock==3.12.4
+funowl==0.2.3
+ghp-import==2.1.0
+greenlet==2.0.1
+hbreader==0.9.1
+idna==3.4
+ijson==3.2.3
+importlib-metadata==6.8.0
+iniconfig==2.0.0
+isodate==0.6.1
+Jinja2==3.1.2
+jsbeautifier==1.14.9
+json-flattener==0.1.9
+jsonasobj==1.3.1
+jsonasobj2==1.0.4
+jsonschema==4.19.1
+jsonschema-specifications==2023.7.1
+kgcl-rdflib==0.5.0
+kgcl-schema==0.6.0
+lark==1.1.7
+linkml-renderer==0.3.0
+linkml-runtime==1.6.0
+lxml==4.9.3
+Markdown==3.4.4
+MarkupSafe==2.1.3
+mergedeep==1.3.4
+mkdocs==1.5.3
+mkdocs-material==9.4.2
+mkdocs-material-extensions==1.2
+mkdocs-mermaid2-plugin==0.6.0
+more-click==0.1.2
+ndex2==3.5.1
+networkx==3.1
+numpy==1.26.0
+oaklib==0.5.20
+ols-client==0.1.4
+omop2owl-vocab==1.1.0
+ontoportal-client==0.0.4
+packaging==23.2
+paginate==0.5.6
+pandas==2.1.1
+pansql==0.0.1
+pathspec==0.11.2
+pbr==5.11.1
+platformdirs==3.10.0
+pluggy==1.3.0
+prefixcommons==0.1.12
+prefixmaps==0.1.5
+pronto==2.5.5
+pydantic==2.4.2
+pydantic_core==2.10.1
+Pygments==2.16.1
+PyJSG==0.11.10
+pymdown-extensions==10.3
+pyparsing==3.1.1
+pysolr==3.9.0
+pystow==0.5.0
+pytest==7.4.2
+pytest-logging==2015.11.4
+python-dateutil==2.8.2
+PyTrie==0.4.0
+pytz==2023.3.post1
+PyYAML==6.0.1
+pyyaml_env_tag==0.1
+ratelimit==2.2.1
+rdflib==7.0.0
+rdflib-jsonld==0.6.1
+rdflib-shim==1.0.3
+referencing==0.30.2
+regex==2023.8.8
+requests==2.31.0
+requests-cache==1.1.0
+requests-toolbelt==1.0.0
+rfc3987==1.3.8
+rpds-py==0.10.3
+scipy==1.11.3
+semsimian==0.2.1
+semsql==0.3.2
+six==1.16.0
+sortedcontainers==2.4.0
+soupsieve==2.5
+SPARQLWrapper==2.0.0
+SQLAlchemy==2.0.21
+SQLAlchemy-Utils==0.38.3
+sssom==0.3.41
+sssom-schema==0.15.0
+stevedore==5.1.0
+tomli==2.0.1
+tqdm==4.66.1
+typing_extensions==4.8.0
+tzdata==2023.3
+url-normalize==1.4.3
+urllib3==2.0.5
+validators==0.22.0
+virtualenv==20.24.5
+virtualenv-clone==0.5.7
+virtualenvwrapper==4.8.4
+watchdog==3.0.0
+wrapt==1.15.0
+zipp==3.17.0