Skip to content

Commit

Permalink
ORDO Mappings
Browse files Browse the repository at this point in the history
- Delete: Unnecessary files and make goals: the temporary analysis script, and the goals that downloaded ordo.owl and queried it via SPARQL.
- Add: SSSOM validation
- Update: Replaced ad hoc SSSOM utilities with the ones from sssom-py
- Add: mondo-icd11 mappings
  • Loading branch information
joeflack4 committed Apr 15, 2024
1 parent 12dc3d1 commit 624aa09
Show file tree
Hide file tree
Showing 8 changed files with 167 additions and 136 deletions.
7 changes: 5 additions & 2 deletions config/icd11.sssom-metadata.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
creator_id: 0000-0002-2906-7319
creator_id: orcid:0000-0002-2906-7319
curie_map:
Orphanet: http://www.orpha.net/ORDO/Orphanet_
icd11.foundation: http://id.who.int/icd/entity/
MONDO: http://purl.obolibrary.org/obo/MONDO_
oboInOwl: http://www.geneontology.org/formats/oboInOwl#
orcid: https://orcid.org/
owl: http://www.w3.org/2002/07/owl#
Orphanet: http://www.orpha.net/ORDO/Orphanet_
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs: http://www.w3.org/2000/01/rdf-schema#
semapv: https://w3id.org/semapv/
skos: http://www.w3.org/2004/02/skos/core#
sssom: https://w3id.org/sssom/
license: http://w3id.org/sssom/license/unspecified
mapping_provider: https://www.orpha.net/
39 changes: 29 additions & 10 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
.DEFAULT_GOAL := all
.PHONY: all release clean
.PHONY: all release clean ontology mappings mappings-validate help
TODAY ?=$(shell date +%Y-%m-%d)
VERSION=v$(TODAY)
SOURCE_URL=https://icd11files.blob.core.windows.net/tmp/whofic-2023-04-08.owl.gz


# MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------
all: tmp/output/release/icd11foundation.owl tmp/output/release/ordo-icd11.sssom.tsv
all: ontology mappings

ontology: tmp/output/release/icd11foundation.owl

mappings: mappings-validate

mappings-validate: tmp/output/release/ordo-icd11.sssom.tsv
sssom validate $<

clean:
rm -rf tmp/
Expand Down Expand Up @@ -37,22 +44,34 @@ release: | tmp/output/release/
gh release create $(VERSION) --notes "New release." --title "$(VERSION)" tmp/output/release/*

# Mappings
# USE_PREBUILT=true downloads the published mondo.sssom.tsv; USE_PREBUILT=false builds it from a fresh Mondo clone.
# Note: the from-source branch previously failed with
#   cp: cannot stat 'tmp/input/mondo/src/ontology/mappings/mondo.sssom.tsv': No such file or directory
# because the recipe cd'ed into tmp/input/mondo/src/ontology and then ran `cp` (in the same shell) with paths that
# are relative to the project root. It now uses `make -C`, so the working directory never changes.
# todo: Switch the default to USE_PREBUILT=false once the from-source build has been confirmed to work end to end.
USE_PREBUILT=true

tmp/input/mondo/:
	rm -rf $@ &&\
	cd tmp/input/ &&\
	git clone --depth 1 https://github.com/monarch-initiative/mondo

# Quoting $(USE_PREBUILT) keeps the `[` test valid even if the variable is overridden with an empty value.
tmp/input/mondo.sssom.tsv: tmp/input/mondo/
	if [ "$(USE_PREBUILT)" = true ]; then \
		wget https://raw.githubusercontent.com/monarch-initiative/mondo/master/src/ontology/mappings/mondo.sssom.tsv -O $@; \
	else \
		make -C tmp/input/mondo/src/ontology mondo.owl mappings -B MIR=false IMP=false &&\
		cp tmp/input/mondo/src/ontology/mappings/mondo.sssom.tsv $@; \
	fi

# todo: Stable URI/filename issue: https://github.com/monarch-initiative/icd11/pull/12#discussion_r1542187711
tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml: | tmp/input/
wget https://www.orphadata.com/data/nomenclature/packs/Orphanet_Nomenclature_Pack_EN.zip -O tmp/input/Orphanet_Nomenclature_Pack_EN.zip
unzip tmp/input/Orphanet_Nomenclature_Pack_EN.zip -d tmp/input/Orphanet_Nomenclature_Pack_EN

tmp/input/ordo.owl: | tmp/input/
wget http://www.orphadata.org/data/ORDO/ordo_orphanet.owl -O $@

tmp/output/icd11mms-exact-matches.tsv: tmp/input/ordo.owl
robot query -i $< --query src/icd11mms-exact-matches.sparql $@

tmp/output/release/ordo-icd11.sssom.tsv: tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml tmp/output/icd11mms-exact-matches.tsv | tmp/output/release/
tmp/output/release/mondo-icd11.sssom.tsv tmp/output/release/ordo-icd11.sssom.tsv: tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml tmp/input/mondo.sssom.tsv | tmp/output/release/
python3 src/mappings.py \
--input-nomenclature-xml tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml \
--input-sssom-config config/icd11.sssom-metadata.yml \
--outpath $@
--input-mondo-sssom tmp/input/mondo.sssom.tsv \
--outpath-ordo-mappings tmp/output/release/ordo-icd11.sssom.tsv \
--outpath-mondo-mappings tmp/output/release/mondo-icd11.sssom.tsv

# HELP -----------------------------------------------------------------------------------------------------------------
help:
Expand Down
1 change: 1 addition & 0 deletions requirements-unlocked.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pandas
pyyaml
sssom
68 changes: 57 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,60 @@
distlib==0.3.6
filelock==3.9.0
numpy==1.25.1
pandas==2.0.3
pbr==5.11.1
platformdirs==3.1.0
python-dateutil==2.8.2
pytz==2023.3
annotated-types==0.6.0
attrs==23.2.0
certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7
curies==0.7.9
Deprecated==1.2.14
deprecation==2.1.0
distlib==0.3.8
exceptiongroup==1.2.0
filelock==3.13.1
hbreader==0.9.1
idna==3.6
importlib_resources==6.4.0
iniconfig==2.0.0
isodate==0.6.1
json-flattener==0.1.9
jsonasobj2==1.0.4
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
linkml-runtime==1.7.5
networkx==3.3
numpy==1.26.4
packaging==24.0
pandas==2.2.1
pansql==0.0.1
pbr==6.0.0
platformdirs==4.2.0
pluggy==1.4.0
prefixcommons==0.1.12
prefixmaps==0.2.3
pydantic==2.6.4
pydantic_core==2.16.3
pyparsing==3.1.2
pytest==8.1.1
pytest-logging==2015.11.4
python-dateutil==2.9.0.post0
PyTrie==0.4.0
pytz==2024.1
PyYAML==6.0.1
rdflib==7.0.0
referencing==0.34.0
requests==2.31.0
rpds-py==0.18.0
scipy==1.13.0
six==1.16.0
stevedore==5.0.0
tzdata==2023.3
virtualenv==20.20.0
sortedcontainers==2.4.0
SPARQLWrapper==2.0.0
SQLAlchemy==2.0.29
sssom==0.4.6
sssom-schema==0.15.2
stevedore==5.1.0
tomli==2.0.1
typing_extensions==4.11.0
tzdata==2024.1
urllib3==2.2.1
validators==0.28.0
virtualenv==20.25.0
virtualenv-clone==0.5.7
wrapt==1.16.0
22 changes: 0 additions & 22 deletions src/icd11mms-exact-matches.sparql

This file was deleted.

76 changes: 63 additions & 13 deletions src/mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
"""Extract mappings"""
"""Extract mappings
todo's (minor):
1. icd11.sssom-metadata.yml: Generalize it by removing the 'mapping_provider' field, which links to Orphanet, and add
that in dynamically just for these ORDO mappings.
"""
from argparse import ArgumentParser
from pathlib import Path
import xml.etree.ElementTree as eleTree
Expand All @@ -13,13 +18,15 @@
INPUT_DIR = PROJECT_DIR / "tmp" / "input"
INTERMEDIATES_DIR = PROJECT_DIR / "tmp" / "output"
RELEASE_DIR = INTERMEDIATES_DIR / "release"
INPUT_NOMENCLATURE_XML = str(INPUT_DIR / "Orphanet_Nomenclature_Pack_EN" / "ORPHA_ICD11_mapping_en_newversion_2023.xml")
INPUT_MMS_MATCHES_TSV = str(INTERMEDIATES_DIR / "icd11mms-exact-matches.tsv")
INPATH_NOMENCLATURE_XML = \
str(INPUT_DIR / "Orphanet_Nomenclature_Pack_EN" / "ORPHA_ICD11_mapping_en_newversion_2023.xml")
INPUT_CONFIG = str(CONFIG_DIR / "icd11.sssom-metadata.yml")
OUTPUT_FILE = str(RELEASE_DIR / "ordo-icd11.sssom.tsv")
INPUT_MONDO_SSSOM = str(INPUT_DIR / 'mondo.sssom.tsv')
OUTPATH_ORDO = str(RELEASE_DIR / "ordo-icd11.sssom.tsv")
OUTPATH_MONDO = str(RELEASE_DIR / "mondo-icd11.sssom.tsv")


def xml_as_df(file_path: str = INPUT_NOMENCLATURE_XML) -> pd.DataFrame:
def xml_as_df(file_path: str = INPATH_NOMENCLATURE_XML) -> pd.DataFrame:
"""Parses XML and gets DF with the fields we care about.
Code source: https://chat.openai.com/share/1daaa4bc-3f6b-4379-b496-f88eff6a4ba0
Expand Down Expand Up @@ -108,11 +115,12 @@ def xml_as_df(file_path: str = INPUT_NOMENCLATURE_XML) -> pd.DataFrame:
df = pd.DataFrame(data)
return df

def run(
input_nomenclature_xml: str = INPUT_NOMENCLATURE_XML, input_sssom_config: str = INPUT_CONFIG,
outpath: str = OUTPUT_FILE
):
"""Run"""

def ordo_sssom(
input_nomenclature_xml: str = INPATH_NOMENCLATURE_XML, input_sssom_config: str = INPUT_CONFIG,
outpath_ordo_mappings: str = OUTPATH_ORDO,
) -> pd.DataFrame:
"""Create ORDO SSSOM mappings"""
# Get data
df: pd.DataFrame = xml_as_df(input_nomenclature_xml)

Expand All @@ -131,7 +139,44 @@ def run(

# Get only columns we care about
df = df[['subject_id', 'subject_label', 'predicate_id', 'object_id']]
write_sssom(df, input_sssom_config, outpath)

# Save & return
write_sssom(df, input_sssom_config, outpath_ordo_mappings)
return df


def mondo_sssom(
    df: pd.DataFrame, input_mondo_sssom: str = INPUT_MONDO_SSSOM, input_sssom_config: str = INPUT_CONFIG,
    outpath_mondo_mappings: str = OUTPATH_MONDO,
) -> pd.DataFrame:
    """Create Mondo SSSOM mappings by joining ORDO mappings onto Mondo's exact matches to Orphanet.

    :param df: ORDO mappings with columns 'subject_id', 'subject_label', 'predicate_id' and 'object_id'. Its
      'subject_id' values are matched against the Orphanet object IDs found in the Mondo SSSOM.
    :param input_mondo_sssom: Path to mondo.sssom.tsv (tab separated; lines starting with '#' are skipped).
    :param input_sssom_config: Path to the SSSOM metadata yml, passed through to `write_sssom()`.
    :param outpath_mondo_mappings: Path the resulting mapping set is written to.
    :return: DataFrame with columns 'subject_id' and 'subject_label' (taken from Mondo) and 'predicate_id' and
      'object_id' (taken from `df`).
    """
    mondo_df = pd.read_csv(input_mondo_sssom, sep='\t', comment='#')

    # Vectorised prefix extraction. Unlike `.apply(lambda x: x.split(':')[0])`, a missing object_id yields NaN here
    # instead of raising AttributeError; NaN never equals 'Orphanet', so such rows are simply filtered out.
    object_prefix = mondo_df['object_id'].str.split(':').str[0]
    is_exact_ordo_match = (mondo_df['predicate_id'] == 'skos:exactMatch') & (object_prefix == 'Orphanet')
    mondo_ordo_df = mondo_df.loc[is_exact_ordo_match, ['subject_id', 'subject_label', 'object_id']].rename(
        columns={'object_id': 'ordo_id'})

    # Rename the ORDO subject columns out of the way so that, after the join, 'subject_id' / 'subject_label' are the
    # Mondo ones while 'predicate_id' / 'object_id' are carried over from the ORDO mappings.
    df2 = df.rename(columns={'subject_id': 'ordo_id', 'subject_label': 'ordo_label'})\
        .merge(mondo_ordo_df, how='inner', on='ordo_id')[
            ['subject_id', 'subject_label', 'predicate_id', 'object_id']]

    # Save & return
    write_sssom(df2, input_sssom_config, outpath_mondo_mappings)
    return df2


def run(
    input_nomenclature_xml: str = INPATH_NOMENCLATURE_XML, input_sssom_config: str = INPUT_CONFIG,
    input_mondo_sssom: str = INPUT_MONDO_SSSOM, outpath_ordo_mappings: str = OUTPATH_ORDO,
    outpath_mondo_mappings: str = OUTPATH_MONDO
):
    """Build both mapping sets: the ORDO mappings first, then the Mondo mappings derived from them."""
    # The ORDO mappings are written to disk by ordo_sssom() and also returned for reuse below.
    ordo_df: pd.DataFrame = ordo_sssom(
        input_nomenclature_xml=input_nomenclature_xml,
        input_sssom_config=input_sssom_config,
        outpath_ordo_mappings=outpath_ordo_mappings,
    )
    mondo_sssom(
        df=ordo_df,
        input_mondo_sssom=input_mondo_sssom,
        input_sssom_config=input_sssom_config,
        outpath_mondo_mappings=outpath_mondo_mappings,
    )


def cli():
Expand All @@ -140,12 +185,17 @@ def cli():
prog='Create SSSOM outputs',
description='Create SSSOM outputs from Orphanet source')
parser.add_argument(
'-n', '--input-nomenclature-xml', default=INPUT_NOMENCLATURE_XML,
'-n', '--input-nomenclature-xml', default=INPATH_NOMENCLATURE_XML,
help='Path to ICD11 mapping XML file from the Orphanet nomenclature pack.')
parser.add_argument(
'-c', '--input-sssom-config', default=INPUT_CONFIG, help='Path to SSSOM config yml.')
parser.add_argument(
'-o', '--outpath', default=OUTPUT_FILE, help='Path to save SSSOM TSV.')
'-s', '--input-mondo-sssom', default=INPUT_MONDO_SSSOM, help='Path to mondo.sssom.tsv.')
parser.add_argument(
'-o', '--outpath-ordo-mappings', default=OUTPATH_ORDO, help='Path to save ORDO->ICD11 SSSOM TSV.')
parser.add_argument(
'-m', '--outpath-mondo-mappings', default=OUTPATH_MONDO,
help='Path to save MONDO->ORDO->ICD11 SSSOM TSV.')
run(**vars(parser.parse_args()))


Expand Down
30 changes: 0 additions & 30 deletions src/temp_compare_matches_owl_and_nomenclature.py

This file was deleted.

60 changes: 12 additions & 48 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,20 @@
"""Utilities"""
import os
from pathlib import Path
from typing import Set, Union
from typing import Dict, Union

import curies
import pandas as pd
import yaml
from sssom import MappingSetDataFrame
from sssom.writers import write_table


def find_prefixes_in_mapping_set(source_df: pd.DataFrame) -> Set[str]:
    """Collect the CURIE prefixes used in the subject, object and predicate ID columns of a mapping set.

    The prefix of a value is everything before its first ':'; a string without ':' counts as its own prefix.
    Missing (NaN / None) and other non-string values are ignored, so the result contains strings only.

    :param source_df: Mapping set with 'subject_id', 'object_id' and 'predicate_id' columns. It is not modified.
    :return: Set of distinct prefixes.
    :raises KeyError: If one of the three ID columns is absent.
    """
    prefixes: Set[str] = set()
    for col in ('subject_id', 'object_id', 'predicate_id'):
        # unique() first: mapping sets repeat the same few prefixes across many rows
        for value in source_df[col].dropna().unique():
            if isinstance(value, str):
                prefixes.add(value.split(':')[0])
    return prefixes


# todo: Add to sssom-py. Shared between, at the least, ICD11 and MedGen repos
def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
"""Writes a SSSOM file with commented metadata at the top of the file.
Keeps only the prefixes in curie_map that occur in the mapping set."""
temp_filtered_config_path = str(config_path) + '.tmp'
# Load config
config = yaml.safe_load(open(config_path, 'r'))
# Filter curie_map
prefixes: Set[str] = find_prefixes_in_mapping_set(df)
config['curie_map'] = {k: v for k, v in config['curie_map'].items() if k in prefixes}
# Write
with open(temp_filtered_config_path, 'w') as f:
yaml.dump(config, f)
write_tsv_with_comments(df, temp_filtered_config_path, outpath)
os.remove(temp_filtered_config_path)


def write_tsv_with_comments(df: pd.DataFrame, comments_file: Union[Path, str], outpath: Union[Path, str]):
    """Write a TSV whose leading lines are the contents of `comments_file`, each prefixed with '# '.

    :param df: Data to write below the comment block (tab separated, without the index).
    :param comments_file: Text file whose lines become the commented header.
    :param outpath: Destination path. An existing file is overwritten.
    """
    # Build the commented metadata header
    with open(comments_file, "r") as f:
        metadata_str = ''.join("# " + line for line in f)
    # Without a final newline the TSV column header would be glued onto the last comment line
    if metadata_str and not metadata_str.endswith('\n'):
        metadata_str += '\n'

    # Mode 'w' truncates, replacing any previous output
    with open(outpath, 'w') as f:
        f.write(metadata_str)
    # Append the data; pandas opens the path itself so its own line-terminator handling applies
    df.to_csv(outpath, index=False, sep='\t', mode='a')
"""Writes a SSSOM file"""
with open(config_path, 'r') as yaml_file:
metadata: Dict = yaml.load(yaml_file, Loader=yaml.FullLoader)
converter = curies.Converter.from_prefix_map(metadata['curie_map'])
msdf: MappingSetDataFrame = MappingSetDataFrame(converter=converter, df=df, metadata=metadata)
with open(outpath, 'w') as f:
write_table(msdf, f)

0 comments on commit 624aa09

Please sign in to comment.