Skip to content

Commit

Permalink
ORDO Mappings
Browse files Browse the repository at this point in the history
- Delete: Unnecessary files and goals: temp analysis, getting of ordo.owl and querying it via SPARQL.
- Add: SSSOM validation
- Update: Replaced ad hoc SSSOM utilities with the ones from sssom-py
- Add: mondo-icd11 mappings
- Rename: SSSOM files: icd11 -> icd11foundation
  • Loading branch information
joeflack4 committed Apr 16, 2024
1 parent 12dc3d1 commit 0f0110d
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 137 deletions.
7 changes: 5 additions & 2 deletions config/icd11.sssom-metadata.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
creator_id: 0000-0002-2906-7319
creator_id: orcid:0000-0002-2906-7319
curie_map:
Orphanet: http://www.orpha.net/ORDO/Orphanet_
icd11.foundation: http://id.who.int/icd/entity/
MONDO: http://purl.obolibrary.org/obo/MONDO_
oboInOwl: http://www.geneontology.org/formats/oboInOwl#
orcid: https://orcid.org/
owl: http://www.w3.org/2002/07/owl#
Orphanet: http://www.orpha.net/ORDO/Orphanet_
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs: http://www.w3.org/2000/01/rdf-schema#
semapv: https://w3id.org/semapv/
skos: http://www.w3.org/2004/02/skos/core#
sssom: https://w3id.org/sssom/
license: http://w3id.org/sssom/license/unspecified
mapping_provider: https://www.orpha.net/
39 changes: 29 additions & 10 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
.DEFAULT_GOAL := all
.PHONY: all release clean
.PHONY: all release clean ontology mappings mappings-validate help
# Date stamp in YYYY-MM-DD form; `?=` keeps a value supplied by the environment or the command line.
TODAY ?=$(shell date +%Y-%m-%d)
# Release tag derived from TODAY (e.g. v2024-04-16); passed to `gh release create` by the `release` goal.
VERSION=v$(TODAY)
# NOTE(review): presumably the download location of the gzipped ICD-11 source OWL; its use is not visible here - confirm.
SOURCE_URL=https://icd11files.blob.core.windows.net/tmp/whofic-2023-04-08.owl.gz


# MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------
all: tmp/output/release/icd11foundation.owl tmp/output/release/ordo-icd11.sssom.tsv
all: ontology mappings

# Build the released ICD-11 Foundation ontology.
ontology: tmp/output/release/icd11foundation.owl

# Build the SSSOM mapping files; they are only considered done once they validate.
mappings: mappings-validate

# Validate the ORDO mappings with the `sssom` CLI (sssom-py).
# The prerequisite must name the file under the *-icd11foundation.sssom.tsv naming used by the mapping rule;
# the former name (ordo-icd11.sssom.tsv) has no rule, so make stopped with "No rule to make target".
mappings-validate: tmp/output/release/ordo-icd11foundation.sssom.tsv
	sssom validate $<

clean:
rm -rf tmp/
Expand Down Expand Up @@ -37,22 +44,34 @@ release: | tmp/output/release/
gh release create $(VERSION) --notes "New release." --title "$(VERSION)" tmp/output/release/*

# Mappings
# USE_PREBUILT=true (default): download the mondo.sssom.tsv published on Mondo's master branch.
# USE_PREBUILT=false: clone Mondo and build mondo.sssom.tsv locally.
# The local build used to fail with:
#   cp: cannot stat 'tmp/input/mondo/src/ontology/mappings/mondo.sssom.tsv': No such file or directory
# because `cp` ran after the recipe had `cd`-ed into tmp/input/mondo/src/ontology, so both project-relative paths
# (the source and $@) were resolved against the wrong directory. The sub-make now runs via -C, which leaves the
# recipe's own working directory untouched.
USE_PREBUILT=true

# Shallow clone of Mondo; only needed for the local build. Order-only on tmp/input/ so that the parent directory
# exists without its timestamp forcing a re-clone.
tmp/input/mondo/: | tmp/input/
	rm -rf $@
	git clone --depth 1 https://github.com/monarch-initiative/mondo $@

ifeq ($(strip $(USE_PREBUILT)),true)
# Download to a temporary name first: `wget -O` creates the output file even when the download fails, which would
# otherwise leave an empty but "up to date" target behind.
tmp/input/mondo.sssom.tsv: | tmp/input/
	wget https://raw.githubusercontent.com/monarch-initiative/mondo/master/src/ontology/mappings/mondo.sssom.tsv -O $@.tmp
	mv -f $@.tmp $@
else
# The clone is an order-only prerequisite: a directory's timestamp changes whenever its contents do, which would
# trigger spurious rebuilds if it were a normal prerequisite.
tmp/input/mondo.sssom.tsv: | tmp/input/mondo/
	$(MAKE) -C tmp/input/mondo/src/ontology mondo.owl mappings -B MIR=false IMP=false
	cp tmp/input/mondo/src/ontology/mappings/mondo.sssom.tsv $@
endif

# todo: Stable URI/filename issue: https://github.com/monarch-initiative/icd11/pull/12#discussion_r1542187711
# Download the Orphanet nomenclature pack and extract it into the target's directory.
# - `unzip -o` overwrites without prompting, so a re-run over an existing extraction cannot stall or skip files.
# - The final check fails loudly if the archive no longer contains the expected XML (see the todo above) instead of
#   letting the recipe "succeed" without producing its target.
tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml: | tmp/input/
	wget https://www.orphadata.com/data/nomenclature/packs/Orphanet_Nomenclature_Pack_EN.zip -O tmp/input/Orphanet_Nomenclature_Pack_EN.zip
	unzip -o tmp/input/Orphanet_Nomenclature_Pack_EN.zip -d $(@D)
	@test -f $@ || { echo "Expected file not found in nomenclature pack: $@" >&2; exit 1; }

tmp/input/ordo.owl: | tmp/input/
wget http://www.orphadata.org/data/ORDO/ordo_orphanet.owl -O $@

tmp/output/icd11mms-exact-matches.tsv: tmp/input/ordo.owl
robot query -i $< --query src/icd11mms-exact-matches.sparql $@

tmp/output/release/ordo-icd11.sssom.tsv: tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml tmp/output/icd11mms-exact-matches.tsv | tmp/output/release/
# src/mappings.py writes both SSSOM files in a single run, so only the ORDO file carries the recipe and the Mondo
# file is declared as depending on it. Listing both names in one rule header would define two independent rules
# that may run the script twice (and concurrently under `make -j`).
# The output paths handed to the script must be the target names themselves; otherwise the targets are never
# created and the rule reruns on every invocation.
tmp/output/release/ordo-icd11foundation.sssom.tsv: tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml tmp/input/mondo.sssom.tsv | tmp/output/release/
	python3 src/mappings.py \
		--input-nomenclature-xml $< \
		--input-sssom-config config/icd11.sssom-metadata.yml \
		--input-mondo-sssom tmp/input/mondo.sssom.tsv \
		--outpath-ordo-mappings $@ \
		--outpath-mondo-mappings tmp/output/release/mondo-icd11foundation.sssom.tsv

# Produced as a by-product of the rule above; this recipe only reports the case where the file has gone missing.
tmp/output/release/mondo-icd11foundation.sssom.tsv: tmp/output/release/ordo-icd11foundation.sssom.tsv
	@test -f $@ || { echo "$@ is missing; delete $< and re-run make to regenerate both files." >&2; exit 1; }

# HELP -----------------------------------------------------------------------------------------------------------------
help:
Expand Down
1 change: 1 addition & 0 deletions requirements-unlocked.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pandas
pyyaml
sssom
68 changes: 57 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,60 @@
distlib==0.3.6
filelock==3.9.0
numpy==1.25.1
pandas==2.0.3
pbr==5.11.1
platformdirs==3.1.0
python-dateutil==2.8.2
pytz==2023.3
annotated-types==0.6.0
attrs==23.2.0
certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7
curies==0.7.9
Deprecated==1.2.14
deprecation==2.1.0
distlib==0.3.8
exceptiongroup==1.2.0
filelock==3.13.1
hbreader==0.9.1
idna==3.6
importlib_resources==6.4.0
iniconfig==2.0.0
isodate==0.6.1
json-flattener==0.1.9
jsonasobj2==1.0.4
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
linkml-runtime==1.7.5
networkx==3.3
numpy==1.26.4
packaging==24.0
pandas==2.2.1
pansql==0.0.1
pbr==6.0.0
platformdirs==4.2.0
pluggy==1.4.0
prefixcommons==0.1.12
prefixmaps==0.2.3
pydantic==2.6.4
pydantic_core==2.16.3
pyparsing==3.1.2
pytest==8.1.1
pytest-logging==2015.11.4
python-dateutil==2.9.0.post0
PyTrie==0.4.0
pytz==2024.1
PyYAML==6.0.1
rdflib==7.0.0
referencing==0.34.0
requests==2.31.0
rpds-py==0.18.0
scipy==1.13.0
six==1.16.0
stevedore==5.0.0
tzdata==2023.3
virtualenv==20.20.0
sortedcontainers==2.4.0
SPARQLWrapper==2.0.0
SQLAlchemy==2.0.29
sssom==0.4.6
sssom-schema==0.15.2
stevedore==5.1.0
tomli==2.0.1
typing_extensions==4.11.0
tzdata==2024.1
urllib3==2.2.1
validators==0.28.0
virtualenv==20.25.0
virtualenv-clone==0.5.7
wrapt==1.16.0
22 changes: 0 additions & 22 deletions src/icd11mms-exact-matches.sparql

This file was deleted.

81 changes: 67 additions & 14 deletions src/mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
"""Extract mappings"""
"""Extract mappings
todo's (minor):
1. icd11.sssom-metadata.yml: Generalize it by removing the 'mapping_provider' field, which links to Orphanet, and add
that in dynamically just for these ORDO mappings.
"""
from argparse import ArgumentParser
from pathlib import Path
import xml.etree.ElementTree as eleTree
Expand All @@ -13,13 +18,15 @@
INPUT_DIR = PROJECT_DIR / "tmp" / "input"
INTERMEDIATES_DIR = PROJECT_DIR / "tmp" / "output"
RELEASE_DIR = INTERMEDIATES_DIR / "release"
INPUT_NOMENCLATURE_XML = str(INPUT_DIR / "Orphanet_Nomenclature_Pack_EN" / "ORPHA_ICD11_mapping_en_newversion_2023.xml")
INPUT_MMS_MATCHES_TSV = str(INTERMEDIATES_DIR / "icd11mms-exact-matches.tsv")
INPATH_NOMENCLATURE_XML = \
str(INPUT_DIR / "Orphanet_Nomenclature_Pack_EN" / "ORPHA_ICD11_mapping_en_newversion_2023.xml")
INPUT_CONFIG = str(CONFIG_DIR / "icd11.sssom-metadata.yml")
OUTPUT_FILE = str(RELEASE_DIR / "ordo-icd11.sssom.tsv")
INPUT_MONDO_SSSOM = str(INPUT_DIR / 'mondo.sssom.tsv')
OUTPATH_ORDO = str(RELEASE_DIR / "ordo-icd11foundation.sssom.tsv")
OUTPATH_MONDO = str(RELEASE_DIR / "mondo-icd11foundation.sssom.tsv")


def xml_as_df(file_path: str = INPUT_NOMENCLATURE_XML) -> pd.DataFrame:
def xml_as_df(file_path: str = INPATH_NOMENCLATURE_XML) -> pd.DataFrame:
"""Parses XML and gets DF with the fields we care about.
Code source: https://chat.openai.com/share/1daaa4bc-3f6b-4379-b496-f88eff6a4ba0
Expand Down Expand Up @@ -108,11 +115,12 @@ def xml_as_df(file_path: str = INPUT_NOMENCLATURE_XML) -> pd.DataFrame:
df = pd.DataFrame(data)
return df

def run(
input_nomenclature_xml: str = INPUT_NOMENCLATURE_XML, input_sssom_config: str = INPUT_CONFIG,
outpath: str = OUTPUT_FILE
):
"""Run"""

def ordo_sssom(
input_nomenclature_xml: str = INPATH_NOMENCLATURE_XML, input_sssom_config: str = INPUT_CONFIG,
outpath_ordo_mappings: str = OUTPATH_ORDO,
) -> pd.DataFrame:
"""Create ORDO SSSOM mappings"""
# Get data
df: pd.DataFrame = xml_as_df(input_nomenclature_xml)

Expand All @@ -129,9 +137,49 @@ def run(
# Filter by only if ICD11Foundation ref exists && is exact match
df = df[(df['DisorderMappingICDRefUri'].notna()) & (df['predicate_id'] == 'skos:exactMatch')]

# Add mapping_justification
df['mapping_justification'] = 'semapv:UnspecifiedMatching'

# Get only columns we care about
df = df[['subject_id', 'subject_label', 'predicate_id', 'object_id']]
write_sssom(df, input_sssom_config, outpath)
df = df[['subject_id', 'subject_label', 'predicate_id', 'object_id', 'mapping_justification']]

# Save & return
write_sssom(df, input_sssom_config, outpath_ordo_mappings)
return df


def mondo_sssom(
    df: pd.DataFrame, input_mondo_sssom: str = INPUT_MONDO_SSSOM, input_sssom_config: str = INPUT_CONFIG,
    outpath_mondo_mappings: str = OUTPATH_MONDO,
) -> pd.DataFrame:
    """Create Mondo SSSOM mappings by joining Mondo->ORDO exact matches onto the ORDO mappings.

    :param df: ORDO mappings as returned by `ordo_sssom()`. Must contain the columns `subject_id` (the ORDO ID),
      `subject_label`, `predicate_id`, `object_id` and `mapping_justification`.
    :param input_mondo_sssom: Path to mondo.sssom.tsv.
    :param input_sssom_config: Path to the SSSOM metadata yml, passed through to `write_sssom()`.
    :param outpath_mondo_mappings: Path the resulting SSSOM TSV is written to.
    :return: DataFrame with columns `subject_id` and `subject_label` (taken from mondo.sssom.tsv), plus
      `predicate_id`, `object_id` and `mapping_justification` (carried over from `df`).
    """
    # NOTE(review): `comment='#'` makes pandas drop the remainder of any line from the first '#', so a '#' inside a
    # field would truncate that row - confirm mondo.sssom.tsv never contains one outside its metadata header.
    mondo_sssom_df = pd.read_csv(input_mondo_sssom, sep='\t', comment='#')

    # CURIE prefix of every mapping object. Casting to str first means rows with a missing object_id (NaN after
    # read_csv) yield the prefix 'nan' and are filtered out below, instead of raising AttributeError on `float.split`.
    object_prefix = mondo_sssom_df['object_id'].astype(str).str.split(':').str[0]

    # Keep only exact matches from Mondo terms to Orphanet terms
    is_exact_ordo_match = (mondo_sssom_df['predicate_id'] == 'skos:exactMatch') & (object_prefix == 'Orphanet')
    mondo_to_ordo = mondo_sssom_df.loc[
        is_exact_ordo_match, ['subject_id', 'subject_label', 'object_id']
    ].rename(columns={'object_id': 'ordo_id'})

    # Re-key the ORDO mappings by ORDO ID so that the Mondo ID / label become the subject after the join. The inner
    # join drops ORDO terms that have no exact Mondo match.
    ordo_mappings = df.rename(columns={'subject_id': 'ordo_id', 'subject_label': 'ordo_label'})
    df2 = ordo_mappings.merge(mondo_to_ordo, how='inner', on='ordo_id')[
        ['subject_id', 'subject_label', 'predicate_id', 'object_id', 'mapping_justification']]

    # Save & return
    write_sssom(df2, input_sssom_config, outpath_mondo_mappings)
    return df2


def run(
    input_nomenclature_xml: str = INPATH_NOMENCLATURE_XML,
    input_sssom_config: str = INPUT_CONFIG,
    input_mondo_sssom: str = INPUT_MONDO_SSSOM,
    outpath_ordo_mappings: str = OUTPATH_ORDO,
    outpath_mondo_mappings: str = OUTPATH_MONDO
):
    """Create both SSSOM outputs: first the ORDO mappings, then the Mondo mappings derived from them.

    :param input_nomenclature_xml: Path to the ICD11 mapping XML from the Orphanet nomenclature pack.
    :param input_sssom_config: Path to the SSSOM config yml; used for both outputs.
    :param input_mondo_sssom: Path to mondo.sssom.tsv.
    :param outpath_ordo_mappings: Where `ordo_sssom()` saves its TSV.
    :param outpath_mondo_mappings: Where `mondo_sssom()` saves its TSV.
    """
    # The ORDO step saves its own file and hands back the mappings that the Mondo step joins against
    ordo_mappings: pd.DataFrame = ordo_sssom(
        input_nomenclature_xml=input_nomenclature_xml,
        input_sssom_config=input_sssom_config,
        outpath_ordo_mappings=outpath_ordo_mappings)
    mondo_sssom(
        df=ordo_mappings,
        input_mondo_sssom=input_mondo_sssom,
        input_sssom_config=input_sssom_config,
        outpath_mondo_mappings=outpath_mondo_mappings)


def cli():
Expand All @@ -140,12 +188,17 @@ def cli():
prog='Create SSSOM outputs',
description='Create SSSOM outputs from Orphanet source')
parser.add_argument(
'-n', '--input-nomenclature-xml', default=INPUT_NOMENCLATURE_XML,
'-n', '--input-nomenclature-xml', default=INPATH_NOMENCLATURE_XML,
help='Path to ICD11 mapping XML file from the Orphanet nomenclature pack.')
parser.add_argument(
'-c', '--input-sssom-config', default=INPUT_CONFIG, help='Path to SSSOM config yml.')
parser.add_argument(
'-o', '--outpath', default=OUTPUT_FILE, help='Path to save SSSOM TSV.')
'-s', '--input-mondo-sssom', default=INPUT_MONDO_SSSOM, help='Path to mondo.sssom.tsv.')
parser.add_argument(
'-o', '--outpath-ordo-mappings', default=OUTPATH_ORDO, help='Path to save ORDO->ICD11 SSSOM TSV.')
parser.add_argument(
'-m', '--outpath-mondo-mappings', default=OUTPATH_MONDO,
help='Path to save MONDO->ORDO->ICD11 SSSOM TSV.')
run(**vars(parser.parse_args()))


Expand Down
30 changes: 0 additions & 30 deletions src/temp_compare_matches_owl_and_nomenclature.py

This file was deleted.

Loading

0 comments on commit 0f0110d

Please sign in to comment.