ORDO Mappings

- Delete: Unnecessary files and goals (temp analysis; downloading ordo.owl and querying it via SPARQL). - Add: SSSOM validation. - Update: Replaced ad hoc SSSOM utilities with the ones from sssom-py. - Add: mondo-icd11 mappings.
monarch-initiative · Apr 15, 2024 · 624aa09 · 624aa09
1 parent 12dc3d1
commit 624aa09
Show file tree

Hide file tree

Showing 8 changed files with 167 additions and 136 deletions.
diff --git a/config/icd11.sssom-metadata.yml b/config/icd11.sssom-metadata.yml
@@ -1,12 +1,15 @@
-creator_id: 0000-0002-2906-7319
+creator_id: orcid:0000-0002-2906-7319
 curie_map:
-  Orphanet: http://www.orpha.net/ORDO/Orphanet_
   icd11.foundation: http://id.who.int/icd/entity/
+  MONDO: http://purl.obolibrary.org/obo/MONDO_
   oboInOwl: http://www.geneontology.org/formats/oboInOwl#
+  orcid: https://orcid.org/
   owl: http://www.w3.org/2002/07/owl#
+  Orphanet: http://www.orpha.net/ORDO/Orphanet_
   rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
   rdfs: http://www.w3.org/2000/01/rdf-schema#
   semapv: https://w3id.org/semapv/
   skos: http://www.w3.org/2004/02/skos/core#
   sssom: https://w3id.org/sssom/
 license: http://w3id.org/sssom/license/unspecified
+mapping_provider: https://www.orpha.net/
diff --git a/makefile b/makefile
@@ -1,12 +1,19 @@
 .DEFAULT_GOAL := all
-.PHONY: all release clean
+.PHONY: all release clean ontology mappings mappings-validate help
 TODAY ?=$(shell date +%Y-%m-%d)
 VERSION=v$(TODAY)
 SOURCE_URL=https://icd11files.blob.core.windows.net/tmp/whofic-2023-04-08.owl.gz
 
 
 # MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------
-all: tmp/output/release/icd11foundation.owl tmp/output/release/ordo-icd11.sssom.tsv
+# Default goal: build the ontology release file, then build and validate the mappings.
+all: ontology mappings
+
+ontology: tmp/output/release/icd11foundation.owl
+
+mappings: mappings-validate
+
+# Builds the ORDO->ICD11 SSSOM file if needed, then runs `sssom validate` on it ($< is the first prerequisite).
+# NOTE(review): tmp/output/release/mondo-icd11.sssom.tsv is neither a prerequisite here nor validated; confirm
+#  whether it should be.
+mappings-validate: tmp/output/release/ordo-icd11.sssom.tsv
+	sssom validate $<
 
 clean:
 	rm -rf tmp/
@@ -37,22 +44,34 @@ release: | tmp/output/release/
 	gh release create $(VERSION) --notes "New release." --title "$(VERSION)" tmp/output/release/*
 
 # Mappings
+# USE_PREBUILT=true (default): download the mondo.sssom.tsv published on the master branch of the Mondo repository.
+# USE_PREBUILT=false: clone Mondo and build its mappings locally. Override on the command line:
+#  make USE_PREBUILT=false tmp/input/mondo.sssom.tsv
+USE_PREBUILT=true
+
+# Shallow clone of Mondo; only needed when USE_PREBUILT is not true. Any previous clone is replaced.
+tmp/input/mondo/: | tmp/input/
+	rm -rf $@ &&\
+	git clone --depth 1 https://github.com/monarch-initiative/mondo $@
+
+ifeq ($(strip $(USE_PREBUILT)),true)
+# Download to a temporary name first so that a failed download never leaves an empty file that looks up to date.
+tmp/input/mondo.sssom.tsv: | tmp/input/
+	wget https://raw.githubusercontent.com/monarch-initiative/mondo/master/src/ontology/mappings/mondo.sssom.tsv -O $@.tmp &&\
+	mv $@.tmp $@
+else
+# The build runs in a sub-make via -C instead of `cd`-ing in the recipe shell. The working directory of the `cp` is
+# therefore still the project root, so its root-relative source path and $@ both resolve. (Previously the `cp` ran
+# from inside tmp/input/mondo/src/ontology and failed with "cannot stat".)
+# The clone is an order-only prerequisite: it must exist, but its directory timestamp should not trigger rebuilds.
+tmp/input/mondo.sssom.tsv: | tmp/input/mondo/
+	$(MAKE) -C tmp/input/mondo/src/ontology mondo.owl mappings -B MIR=false IMP=false
+	cp tmp/input/mondo/src/ontology/mappings/mondo.sssom.tsv $@
+endif
+
 # todo: Stable URI/filename issue: https://github.com/monarch-initiative/icd11/pull/12#discussion_r1542187711
 tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml: | tmp/input/
 	wget https://www.orphadata.com/data/nomenclature/packs/Orphanet_Nomenclature_Pack_EN.zip -O tmp/input/Orphanet_Nomenclature_Pack_EN.zip
 	unzip tmp/input/Orphanet_Nomenclature_Pack_EN.zip -d tmp/input/Orphanet_Nomenclature_Pack_EN
 
-tmp/input/ordo.owl: | tmp/input/
-	wget http://www.orphadata.org/data/ORDO/ordo_orphanet.owl -O $@
-
-tmp/output/icd11mms-exact-matches.tsv: tmp/input/ordo.owl
-	robot query -i $< --query src/icd11mms-exact-matches.sparql $@
-
-tmp/output/release/ordo-icd11.sssom.tsv: tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml tmp/output/icd11mms-exact-matches.tsv | tmp/output/release/
+# Both release files are written by one run of src/mappings.py (see the two --outpath-* flags in the recipe).
+# NOTE(review): a plain `a b: prereqs` header declares two independent rules sharing this recipe, so make may run
+#  the script once per target (possibly concurrently under -j). A grouped target (`&:`, GNU Make >= 4.3) or a stamp
+#  file would avoid that; confirm the minimum GNU Make version supported before switching.
+tmp/output/release/mondo-icd11.sssom.tsv tmp/output/release/ordo-icd11.sssom.tsv: tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml tmp/input/mondo.sssom.tsv | tmp/output/release/
+	python3 src/mappings.py \
 	--input-nomenclature-xml tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml \
 	--input-sssom-config config/icd11.sssom-metadata.yml \
-	--outpath $@
+	--input-mondo-sssom tmp/input/mondo.sssom.tsv \
+	--outpath-ordo-mappings tmp/output/release/ordo-icd11.sssom.tsv \
+	--outpath-mondo-mappings tmp/output/release/mondo-icd11.sssom.tsv
 
 # HELP -----------------------------------------------------------------------------------------------------------------
 help:

diff --git a/requirements-unlocked.txt b/requirements-unlocked.txt
@@ -1,2 +1,3 @@
 pandas
 pyyaml
+sssom
diff --git a/requirements.txt b/requirements.txt
@@ -1,14 +1,60 @@
-distlib==0.3.6
-filelock==3.9.0
-numpy==1.25.1
-pandas==2.0.3
-pbr==5.11.1
-platformdirs==3.1.0
-python-dateutil==2.8.2
-pytz==2023.3
+annotated-types==0.6.0
+attrs==23.2.0
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+curies==0.7.9
+Deprecated==1.2.14
+deprecation==2.1.0
+distlib==0.3.8
+exceptiongroup==1.2.0
+filelock==3.13.1
+hbreader==0.9.1
+idna==3.6
+importlib_resources==6.4.0
+iniconfig==2.0.0
+isodate==0.6.1
+json-flattener==0.1.9
+jsonasobj2==1.0.4
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+linkml-runtime==1.7.5
+networkx==3.3
+numpy==1.26.4
+packaging==24.0
+pandas==2.2.1
+pansql==0.0.1
+pbr==6.0.0
+platformdirs==4.2.0
+pluggy==1.4.0
+prefixcommons==0.1.12
+prefixmaps==0.2.3
+pydantic==2.6.4
+pydantic_core==2.16.3
+pyparsing==3.1.2
+pytest==8.1.1
+pytest-logging==2015.11.4
+python-dateutil==2.9.0.post0
+PyTrie==0.4.0
+pytz==2024.1
 PyYAML==6.0.1
+rdflib==7.0.0
+referencing==0.34.0
+requests==2.31.0
+rpds-py==0.18.0
+scipy==1.13.0
 six==1.16.0
-stevedore==5.0.0
-tzdata==2023.3
-virtualenv==20.20.0
+sortedcontainers==2.4.0
+SPARQLWrapper==2.0.0
+SQLAlchemy==2.0.29
+sssom==0.4.6
+sssom-schema==0.15.2
+stevedore==5.1.0
+tomli==2.0.1
+typing_extensions==4.11.0
+tzdata==2024.1
+urllib3==2.2.1
+validators==0.28.0
+virtualenv==20.25.0
 virtualenv-clone==0.5.7
+wrapt==1.16.0
diff --git a/src/icd11mms-exact-matches.sparql b/src/icd11mms-exact-matches.sparql
diff --git a/src/mappings.py b/src/mappings.py
@@ -1,4 +1,9 @@
-"""Extract mappings"""
+"""Extract mappings
+
+todo's (minor):
+ 1. icd11.sssom-metadata.yml: Generalize it by removing the 'mapping_provider' field, which links to Orphanet, and add
+ that in dynamically just for these ORDO mappings.
+"""
 from argparse import ArgumentParser
 from pathlib import Path
 import xml.etree.ElementTree as eleTree
@@ -13,13 +18,15 @@
 INPUT_DIR = PROJECT_DIR / "tmp" / "input"
 INTERMEDIATES_DIR = PROJECT_DIR / "tmp" / "output"
 RELEASE_DIR = INTERMEDIATES_DIR / "release"
-INPUT_NOMENCLATURE_XML = str(INPUT_DIR / "Orphanet_Nomenclature_Pack_EN" / "ORPHA_ICD11_mapping_en_newversion_2023.xml")
-INPUT_MMS_MATCHES_TSV = str(INTERMEDIATES_DIR / "icd11mms-exact-matches.tsv")
+INPATH_NOMENCLATURE_XML = \
+    str(INPUT_DIR / "Orphanet_Nomenclature_Pack_EN" / "ORPHA_ICD11_mapping_en_newversion_2023.xml")
 INPUT_CONFIG = str(CONFIG_DIR / "icd11.sssom-metadata.yml")
-OUTPUT_FILE = str(RELEASE_DIR / "ordo-icd11.sssom.tsv")
+INPUT_MONDO_SSSOM = str(INPUT_DIR / 'mondo.sssom.tsv')
+OUTPATH_ORDO = str(RELEASE_DIR / "ordo-icd11.sssom.tsv")
+OUTPATH_MONDO = str(RELEASE_DIR / "mondo-icd11.sssom.tsv")
 
 
-def xml_as_df(file_path: str = INPUT_NOMENCLATURE_XML) -> pd.DataFrame:
+def xml_as_df(file_path: str = INPATH_NOMENCLATURE_XML) -> pd.DataFrame:
     """Parses XML and gets DF with the fields we care about.
 
     Code source: https://chat.openai.com/share/1daaa4bc-3f6b-4379-b496-f88eff6a4ba0
@@ -108,11 +115,12 @@ def xml_as_df(file_path: str = INPUT_NOMENCLATURE_XML) -> pd.DataFrame:
     df = pd.DataFrame(data)
     return df
 
-def run(
-    input_nomenclature_xml: str = INPUT_NOMENCLATURE_XML, input_sssom_config: str = INPUT_CONFIG,
-    outpath: str = OUTPUT_FILE
-):
-    """Run"""
+
+def ordo_sssom(
+    input_nomenclature_xml: str = INPATH_NOMENCLATURE_XML, input_sssom_config: str = INPUT_CONFIG,
+    outpath_ordo_mappings: str = OUTPATH_ORDO,
+) -> pd.DataFrame:
+    """Create ORDO SSSOM mappings"""
     # Get data
     df: pd.DataFrame = xml_as_df(input_nomenclature_xml)
 
@@ -131,7 +139,44 @@ def run(
 
     # Get only columns we care about
     df = df[['subject_id', 'subject_label', 'predicate_id', 'object_id']]
-    write_sssom(df, input_sssom_config, outpath)
+
+    # Save & return
+    write_sssom(df, input_sssom_config, outpath_ordo_mappings)
+    return df
+
+
+def mondo_sssom(
+    df: pd.DataFrame, input_mondo_sssom: str = INPUT_MONDO_SSSOM, input_sssom_config: str = INPUT_CONFIG,
+    outpath_mondo_mappings: str = OUTPATH_MONDO,
+) -> pd.DataFrame:
+    """Create Mondo SSSOM mappings
+
+    Joins the mappings in `df` with Mondo's own `skos:exactMatch` mappings to Orphanet, replacing each row's ORDO
+    subject by the Mondo term(s) that exactly match it.
+
+    :param df: Mappings with columns subject_id, subject_label, predicate_id, object_id. subject_id is joined
+     against the Orphanet object_ids of the Mondo mappings.
+    :param input_mondo_sssom: Path to mondo.sssom.tsv.
+    :param input_sssom_config: Path to the SSSOM metadata yml, passed on to write_sssom().
+    :param outpath_mondo_mappings: Path the resulting SSSOM TSV is written to.
+    :return: The mappings that were written. subject_id and subject_label come from Mondo; predicate_id and
+     object_id are carried over unchanged from `df`."""
+    # NOTE(review): comment='#' makes pandas ignore the remainder of any line from the first '#', not only the
+    #  metadata header lines; confirm that no cell in mondo.sssom.tsv contains '#'.
+    mondo_sssom_df = pd.read_csv(input_mondo_sssom, sep='\t', comment='#')
+    # Cast to str before splitting: a missing object_id is read as float NaN, on which x.split(':') would raise.
+    object_prefix = mondo_sssom_df['object_id'].astype(str).str.split(':', n=1).str[0]
+    is_ordo_exact_match = (mondo_sssom_df['predicate_id'] == 'skos:exactMatch') & (object_prefix == 'Orphanet')
+    mondo_to_ordo = mondo_sssom_df.loc[is_ordo_exact_match, ['subject_id', 'subject_label', 'object_id']]\
+        .rename(columns={'object_id': 'ordo_id'})
+
+    # Inner join: rows of `df` without an exact Mondo match are dropped; rows with several matches yield one output
+    # row per Mondo term.
+    df2 = df.rename(columns={'subject_id': 'ordo_id', 'subject_label': 'ordo_label'})\
+        .merge(mondo_to_ordo, how='inner', on='ordo_id')[
+        ['subject_id', 'subject_label', 'predicate_id', 'object_id']]
+
+    # Save & return
+    write_sssom(df2, input_sssom_config, outpath_mondo_mappings)
+    return df2
+
+
+def run(
+    input_nomenclature_xml: str = INPATH_NOMENCLATURE_XML, input_sssom_config: str = INPUT_CONFIG,
+    input_mondo_sssom: str = INPUT_MONDO_SSSOM, outpath_ordo_mappings: str = OUTPATH_ORDO,
+    outpath_mondo_mappings: str = OUTPATH_MONDO
+):
+    """Build both mapping sets: the ORDO mappings first, then the Mondo mappings derived from them."""
+    # The ORDO mappings are written to disk and also handed on as the basis for the Mondo mappings
+    ordo_df: pd.DataFrame = ordo_sssom(
+        input_nomenclature_xml=input_nomenclature_xml,
+        input_sssom_config=input_sssom_config,
+        outpath_ordo_mappings=outpath_ordo_mappings)
+    mondo_sssom(
+        df=ordo_df,
+        input_mondo_sssom=input_mondo_sssom,
+        input_sssom_config=input_sssom_config,
+        outpath_mondo_mappings=outpath_mondo_mappings)
 
 
 def cli():
@@ -140,12 +185,17 @@ def cli():
         prog='Create SSSOM outputs',
         description='Create SSSOM outputs from Orphanet source')
     parser.add_argument(
-        '-n', '--input-nomenclature-xml', default=INPUT_NOMENCLATURE_XML,
+        '-n', '--input-nomenclature-xml', default=INPATH_NOMENCLATURE_XML,
         help='Path to ICD11 mapping XML file from the Orphanet nomenclature pack.')
     parser.add_argument(
         '-c', '--input-sssom-config', default=INPUT_CONFIG, help='Path to SSSOM config yml.')
     parser.add_argument(
-        '-o', '--outpath', default=OUTPUT_FILE, help='Path to save SSSOM TSV.')
+        '-s', '--input-mondo-sssom', default=INPUT_MONDO_SSSOM, help='Path to mondo.sssom.tsv.')
+    parser.add_argument(
+        '-o', '--outpath-ordo-mappings', default=OUTPATH_ORDO, help='Path to save ORDO->ICD11 SSSOM TSV.')
+    parser.add_argument(
+        '-m', '--outpath-mondo-mappings', default=OUTPATH_MONDO,
+        help='Path to save MONDO->ORDO->ICD11 SSSOM TSV.')
     run(**vars(parser.parse_args()))
 
 

diff --git a/src/temp_compare_matches_owl_and_nomenclature.py b/src/temp_compare_matches_owl_and_nomenclature.py
diff --git a/src/utils.py b/src/utils.py
@@ -1,56 +1,20 @@
 """Utilities"""
-import os
 from pathlib import Path
-from typing import Set, Union
+from typing import Dict, Union
 
+import curies
 import pandas as pd
 import yaml
+from sssom import MappingSetDataFrame
+from sssom.writers import write_table
 
 
-def find_prefixes_in_mapping_set(source_df: pd.DataFrame) -> Set[str]:
-    """Find prefixes in mapping set"""
-    df = source_df.copy()
-    cols_with_prefixes = ['subject_id', 'object_id', 'predicate_id']
-    prefixes = set()
-    for col in cols_with_prefixes:
-        col2 = col.replace('id', 'prefix')
-        df[col2] = df[col].apply(lambda x: x.split(':')[0]
-            if isinstance(x, str) else x)  # handles nan
-        prefixes.update(set(df[col2].to_list()))
-    return prefixes
-
-
+# todo: Add to sssom-py. Shared between, at the least, ICD11 and MedGen repos
 def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
-    """Writes a SSSOM file with commented metadata at the top of the file.
-
-    Filters only prefxes in curie_map that exist in the mapping set."""
-    temp_filtered_config_path = str(config_path) + '.tmp'
-    # Load config
-    config = yaml.safe_load(open(config_path, 'r'))
-    # Filter curie_map
-    prefixes: Set[str] = find_prefixes_in_mapping_set(df)
-    config['curie_map'] = {k: v for k, v in config['curie_map'].items() if k in prefixes}
-    # Write
-    with open(temp_filtered_config_path, 'w') as f:
-        yaml.dump(config, f)
-    write_tsv_with_comments(df, temp_filtered_config_path, outpath)
-    os.remove(temp_filtered_config_path)
-
-
-def write_tsv_with_comments(df: pd.DataFrame, comments_file: Union[Path, str], outpath: Union[Path, str]):
-    """Write a TSV with comments at the top"""
-    # write metadata
-    f = open(comments_file, "r")
-    lines = f.readlines()
-    f.close()
-    output_lines = []
-    for line in lines:
-        output_lines.append("# " + line)
-    metadata_str = ''.join(output_lines)
-    if os.path.exists(outpath):
-        os.remove(outpath)
-    f = open(outpath, 'a')
-    f.write(metadata_str)
-    f.close()
-    # write data
-    df.to_csv(outpath, index=False, sep='\t', mode='a')
+    """Writes a SSSOM file"""
+    # Load the mapping set metadata; it must contain a 'curie_map' of prefix -> URI prefix.
+    # safe_load: the config is plain data, so a loader that can construct arbitrary Python objects is not needed.
+    with open(config_path, 'r', encoding='utf-8') as yaml_file:
+        metadata: Dict = yaml.safe_load(yaml_file)
+    converter = curies.Converter.from_prefix_map(metadata['curie_map'])
+    msdf: MappingSetDataFrame = MappingSetDataFrame(converter=converter, df=df, metadata=metadata)
+    # Explicit UTF-8 so that non-ASCII text is written identically regardless of the platform's default encoding.
+    # An existing file at outpath is overwritten.
+    with open(outpath, 'w', encoding='utf-8') as f:
+        write_table(msdf, f)
-Original file line number
+Diff line change
@@ -1,2 +1,3 @@
     pandas
     pyyaml
+    sssom