From 765587772bdddf958d99dbe785e5ec0c18d3aa99 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Sat, 6 Apr 2024 18:27:30 -0400 Subject: [PATCH] ORDO Mappings - Delete: Unnecessary files and goals: temp analysis, getting of ordo.owl and querying it via SPARQL. - Add: SSSOM validation - Update: Replaced ad hoc SSSOM utilities with the ones from sssom-py --- config/icd11.sssom-metadata.yml | 4 +- makefile | 19 +++--- requirements-unlocked.txt | 1 + requirements.txt | 68 ++++++++++++++++--- src/icd11mms-exact-matches.sparql | 22 ------ src/mappings.py | 23 +++++-- ...mp_compare_matches_owl_and_nomenclature.py | 30 -------- 7 files changed, 90 insertions(+), 77 deletions(-) delete mode 100644 src/icd11mms-exact-matches.sparql delete mode 100644 src/temp_compare_matches_owl_and_nomenclature.py diff --git a/config/icd11.sssom-metadata.yml b/config/icd11.sssom-metadata.yml index 61fb5a3..8d656bd 100644 --- a/config/icd11.sssom-metadata.yml +++ b/config/icd11.sssom-metadata.yml @@ -1,8 +1,9 @@ -creator_id: 0000-0002-2906-7319 +creator_id: orcid:0000-0002-2906-7319 curie_map: Orphanet: http://www.orpha.net/ORDO/Orphanet_ icd11.foundation: http://id.who.int/icd/entity/ oboInOwl: http://www.geneontology.org/formats/oboInOwl# + orcid: https://orcid.org/ owl: http://www.w3.org/2002/07/owl# rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# rdfs: http://www.w3.org/2000/01/rdf-schema# @@ -10,3 +11,4 @@ curie_map: skos: http://www.w3.org/2004/02/skos/core# sssom: https://w3id.org/sssom/ license: http://w3id.org/sssom/license/unspecified +mapping_provider: https://www.orpha.net/ diff --git a/makefile b/makefile index 76d3244..58ee7bf 100644 --- a/makefile +++ b/makefile @@ -1,12 +1,19 @@ .DEFAULT_GOAL := all -.PHONY: all release clean +.PHONY: all release clean ontology mappings mappings-validate help TODAY ?=$(shell date +%Y-%m-%d) VERSION=v$(TODAY) SOURCE_URL=https://icd11files.blob.core.windows.net/tmp/whofic-2023-04-08.owl.gz # MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------ -all: tmp/output/release/icd11foundation.owl tmp/output/release/ordo-icd11.sssom.tsv +all: ontology mappings + +ontology: tmp/output/release/icd11foundation.owl + +mappings: mappings-validate + +mappings-validate: tmp/output/release/ordo-icd11.sssom.tsv + sssom validate $< clean: rm -rf tmp/ @@ -42,13 +49,7 @@ tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.x wget https://www.orphadata.com/data/nomenclature/packs/Orphanet_Nomenclature_Pack_EN.zip -O tmp/input/Orphanet_Nomenclature_Pack_EN.zip unzip tmp/input/Orphanet_Nomenclature_Pack_EN.zip -d tmp/input/Orphanet_Nomenclature_Pack_EN -tmp/input/ordo.owl: | tmp/input/ - wget http://www.orphadata.org/data/ORDO/ordo_orphanet.owl -O $@ - -tmp/output/icd11mms-exact-matches.tsv: tmp/input/ordo.owl - robot query -i $< --query src/icd11mms-exact-matches.sparql $@ - -tmp/output/release/ordo-icd11.sssom.tsv: tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml tmp/output/icd11mms-exact-matches.tsv | tmp/output/release/ +tmp/output/release/ordo-icd11.sssom.tsv: tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml | tmp/output/release/ python3 src/mappings.py \ --input-nomenclature-xml tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml \ --input-sssom-config config/icd11.sssom-metadata.yml \ diff --git a/requirements-unlocked.txt b/requirements-unlocked.txt index b052e87..5f3fb30 100644 --- a/requirements-unlocked.txt +++ b/requirements-unlocked.txt @@ -1,2 +1,3 @@ pandas pyyaml +sssom diff --git a/requirements.txt b/requirements.txt index 6f194b3..a42f76a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,60 @@ -distlib==0.3.6 -filelock==3.9.0 -numpy==1.25.1 -pandas==2.0.3 -pbr==5.11.1 -platformdirs==3.1.0 -python-dateutil==2.8.2 -pytz==2023.3 +annotated-types==0.6.0 +attrs==23.2.0 +certifi==2024.2.2 +charset-normalizer==3.3.2 +click==8.1.7 +curies==0.7.9 +Deprecated==1.2.14 +deprecation==2.1.0 +distlib==0.3.8 +exceptiongroup==1.2.0 +filelock==3.13.1 +hbreader==0.9.1 +idna==3.6 +importlib_resources==6.4.0 +iniconfig==2.0.0 +isodate==0.6.1 +json-flattener==0.1.9 +jsonasobj2==1.0.4 +jsonschema==4.21.1 +jsonschema-specifications==2023.12.1 +linkml-runtime==1.7.5 +networkx==3.3 +numpy==1.26.4 +packaging==24.0 +pandas==2.2.1 +pansql==0.0.1 +pbr==6.0.0 +platformdirs==4.2.0 +pluggy==1.4.0 +prefixcommons==0.1.12 +prefixmaps==0.2.3 +pydantic==2.6.4 +pydantic_core==2.16.3 +pyparsing==3.1.2 +pytest==8.1.1 +pytest-logging==2015.11.4 +python-dateutil==2.9.0.post0 +PyTrie==0.4.0 +pytz==2024.1 PyYAML==6.0.1 +rdflib==7.0.0 +referencing==0.34.0 +requests==2.31.0 +rpds-py==0.18.0 +scipy==1.13.0 six==1.16.0 -stevedore==5.0.0 -tzdata==2023.3 -virtualenv==20.20.0 +sortedcontainers==2.4.0 +SPARQLWrapper==2.0.0 +SQLAlchemy==2.0.29 +sssom==0.4.6 +sssom-schema==0.15.2 +stevedore==5.1.0 +tomli==2.0.1 +typing_extensions==4.11.0 +tzdata==2024.1 +urllib3==2.2.1 +validators==0.28.0 +virtualenv==20.25.0 virtualenv-clone==0.5.7 +wrapt==1.16.0 diff --git a/src/icd11mms-exact-matches.sparql b/src/icd11mms-exact-matches.sparql deleted file mode 100644 index 59b0894..0000000 --- a/src/icd11mms-exact-matches.sparql +++ /dev/null @@ -1,22 +0,0 @@ -prefix skos: -prefix ECO: -prefix owl: -prefix oboInOwl: -prefix sssom: - -SELECT ?cls ?xref -WHERE { - VALUES ?mapping_pred { oboInOwl:hasDbXref } - - ?cls a owl:Class; - ?mapping_pred ?xref . - - ?xref_anno a owl:Axiom ; - owl:annotatedSource ?cls ; - owl:annotatedProperty ?mapping_pred ; - owl:annotatedTarget ?xref ; - ECO:0000218 ?mapping_precision_string . - - FILTER(STRSTARTS(STR(?mapping_precision_string), "- E (Exact mapping: the two concepts are equivalent).")) - FILTER(STRSTARTS(STR(?xref), "ICD-11:")) -} diff --git a/src/mappings.py b/src/mappings.py index 66511c3..a18c865 100644 --- a/src/mappings.py +++ b/src/mappings.py @@ -1,11 +1,19 @@ -"""Extract mappings""" +"""Extract mappings + +todo's (minor): + 1. icd11.sssom-metadata.yml: Generalize it by removing the 'mapping_provider' field, which links to Orphanet, and add + that in dynamically just for these ORDO mappings. +""" from argparse import ArgumentParser from pathlib import Path import xml.etree.ElementTree as eleTree +from typing import Dict +import curies import pandas as pd - -from utils import write_sssom +import yaml +from sssom import MappingSetDataFrame +from sssom.writers import write_table SRC_DIR = Path(__file__).parent PROJECT_DIR = SRC_DIR.parent @@ -131,7 +139,14 @@ def run( # Get only columns we care about df = df[['subject_id', 'subject_label', 'predicate_id', 'object_id']] - write_sssom(df, input_sssom_config, outpath) + + # Write file + with open(input_sssom_config, 'r') as yaml_file: + metadata: Dict = yaml.load(yaml_file, Loader=yaml.FullLoader) + converter = curies.Converter.from_prefix_map(metadata['curie_map']) + msdf: MappingSetDataFrame = MappingSetDataFrame(converter=converter, df=df, metadata=metadata) + with open(outpath, 'w') as f: + write_table(msdf, f) def cli(): diff --git a/src/temp_compare_matches_owl_and_nomenclature.py b/src/temp_compare_matches_owl_and_nomenclature.py deleted file mode 100644 index febaa44..0000000 --- a/src/temp_compare_matches_owl_and_nomenclature.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Temporary analysis file to compare exact matches to ICD11MMS between Orphanet OWL release and nomenclature files.""" -import pandas as pd - -from mappings import INPUT_MMS_MATCHES_TSV, xml_as_df - - -# Read and clean dataframes -nom_df = xml_as_df() -nom_df['predicate_id'] = nom_df['DisorderMappingRelation'].apply( - lambda x: 'skos:exactMatch' if x.startswith('E (') - else 'skos:narrowMatch' if x.startswith('NTBT (') - else 'skos:broadMatch' if x.startswith('BTNT (') - else 'skos:relatedMatch') # 0 instances of relatedMatch; just a fallback -nom_df = nom_df[nom_df['predicate_id'] == 'skos:exactMatch'][['OrphaCode', 'Reference']]\ - .sort_values(['OrphaCode', 'Reference']).reset_index(drop=True) - -owl_df = pd.read_csv(INPUT_MMS_MATCHES_TSV, sep="\t").rename(columns={ - '?cls': 'OrphaCode', - '?xref': 'Reference', -}).sort_values(['OrphaCode', 'Reference']).reset_index(drop=True) -owl_df['OrphaCode'] = owl_df['OrphaCode'].apply(lambda x: x.split('_')[1][:-1]) # get code and remove > char -owl_df['Reference'] = owl_df['Reference'].apply(lambda x: x.split(':')[1]) # get unprefixed code - -# Convert to sets of tuple edges -set_tups = lambda df: set(zip(df['OrphaCode'], df['Reference'])) -nom_set = set_tups(nom_df) # n=1,340 -owl_set = set_tups(owl_df) # n=1,459 -i = nom_set.intersection(owl_set) # n=1,323 -n_diff = nom_set.difference(owl_set) # n=17 -o_diff = owl_set.difference(nom_set) # n=136