From 765587772bdddf958d99dbe785e5ec0c18d3aa99 Mon Sep 17 00:00:00 2001
From: Joe Flack <joeflack4@gmail.com>
Date: Sat, 6 Apr 2024 18:27:30 -0400
Subject: [PATCH] ORDO Mappings - Delete: Unnecessary files and goals: temp
 analysis, downloading ordo.owl and querying it via SPARQL. - Add: SSSOM
 validation - Update: Replaced ad hoc SSSOM utilities with the ones from
 sssom-py

---
 config/icd11.sssom-metadata.yml               |  4 +-
 makefile                                      | 19 +++---
 requirements-unlocked.txt                     |  1 +
 requirements.txt                              | 68 ++++++++++++++++---
 src/icd11mms-exact-matches.sparql             | 22 ------
 src/mappings.py                               | 23 +++++--
 ...mp_compare_matches_owl_and_nomenclature.py | 30 --------
 7 files changed, 90 insertions(+), 77 deletions(-)
 delete mode 100644 src/icd11mms-exact-matches.sparql
 delete mode 100644 src/temp_compare_matches_owl_and_nomenclature.py

diff --git a/config/icd11.sssom-metadata.yml b/config/icd11.sssom-metadata.yml
index 61fb5a3..8d656bd 100644
--- a/config/icd11.sssom-metadata.yml
+++ b/config/icd11.sssom-metadata.yml
@@ -1,8 +1,9 @@
-creator_id: 0000-0002-2906-7319
+creator_id: orcid:0000-0002-2906-7319
 curie_map:
   Orphanet: http://www.orpha.net/ORDO/Orphanet_
   icd11.foundation: http://id.who.int/icd/entity/
   oboInOwl: http://www.geneontology.org/formats/oboInOwl#
+  orcid: https://orcid.org/
   owl: http://www.w3.org/2002/07/owl#
   rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
   rdfs: http://www.w3.org/2000/01/rdf-schema#
@@ -10,3 +11,4 @@ curie_map:
   skos: http://www.w3.org/2004/02/skos/core#
   sssom: https://w3id.org/sssom/
 license: http://w3id.org/sssom/license/unspecified
+mapping_provider: https://www.orpha.net/
diff --git a/makefile b/makefile
index 76d3244..58ee7bf 100644
--- a/makefile
+++ b/makefile
@@ -1,12 +1,19 @@
 .DEFAULT_GOAL := all
-.PHONY: all release clean
+.PHONY: all release clean ontology mappings mappings-validate help
 TODAY ?=$(shell date +%Y-%m-%d)
 VERSION=v$(TODAY)
 SOURCE_URL=https://icd11files.blob.core.windows.net/tmp/whofic-2023-04-08.owl.gz
 
 
 # MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------
-all: tmp/output/release/icd11foundation.owl tmp/output/release/ordo-icd11.sssom.tsv
+all: ontology mappings
+
+ontology: tmp/output/release/icd11foundation.owl
+
+mappings: mappings-validate
+
+mappings-validate: tmp/output/release/ordo-icd11.sssom.tsv
+	sssom validate $<
 
 clean:
 	rm -rf tmp/
@@ -42,13 +49,7 @@ tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.x
 	wget https://www.orphadata.com/data/nomenclature/packs/Orphanet_Nomenclature_Pack_EN.zip -O tmp/input/Orphanet_Nomenclature_Pack_EN.zip
 	unzip tmp/input/Orphanet_Nomenclature_Pack_EN.zip -d tmp/input/Orphanet_Nomenclature_Pack_EN
 
-tmp/input/ordo.owl: | tmp/input/
-	wget http://www.orphadata.org/data/ORDO/ordo_orphanet.owl -O $@
-
-tmp/output/icd11mms-exact-matches.tsv: tmp/input/ordo.owl
-	robot query -i $< --query src/icd11mms-exact-matches.sparql $@
-
-tmp/output/release/ordo-icd11.sssom.tsv: tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml tmp/output/icd11mms-exact-matches.tsv | tmp/output/release/
+tmp/output/release/ordo-icd11.sssom.tsv: tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml | tmp/output/release/
 	python3 src/mappings.py \
 	--input-nomenclature-xml tmp/input/Orphanet_Nomenclature_Pack_EN/ORPHA_ICD11_mapping_en_newversion_2023.xml \
 	--input-sssom-config config/icd11.sssom-metadata.yml \
diff --git a/requirements-unlocked.txt b/requirements-unlocked.txt
index b052e87..5f3fb30 100644
--- a/requirements-unlocked.txt
+++ b/requirements-unlocked.txt
@@ -1,2 +1,3 @@
 pandas
 pyyaml
+sssom
diff --git a/requirements.txt b/requirements.txt
index 6f194b3..a42f76a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,60 @@
-distlib==0.3.6
-filelock==3.9.0
-numpy==1.25.1
-pandas==2.0.3
-pbr==5.11.1
-platformdirs==3.1.0
-python-dateutil==2.8.2
-pytz==2023.3
+annotated-types==0.6.0
+attrs==23.2.0
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+curies==0.7.9
+Deprecated==1.2.14
+deprecation==2.1.0
+distlib==0.3.8
+exceptiongroup==1.2.0
+filelock==3.13.1
+hbreader==0.9.1
+idna==3.6
+importlib_resources==6.4.0
+iniconfig==2.0.0
+isodate==0.6.1
+json-flattener==0.1.9
+jsonasobj2==1.0.4
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+linkml-runtime==1.7.5
+networkx==3.3
+numpy==1.26.4
+packaging==24.0
+pandas==2.2.1
+pansql==0.0.1
+pbr==6.0.0
+platformdirs==4.2.0
+pluggy==1.4.0
+prefixcommons==0.1.12
+prefixmaps==0.2.3
+pydantic==2.6.4
+pydantic_core==2.16.3
+pyparsing==3.1.2
+pytest==8.1.1
+pytest-logging==2015.11.4
+python-dateutil==2.9.0.post0
+PyTrie==0.4.0
+pytz==2024.1
 PyYAML==6.0.1
+rdflib==7.0.0
+referencing==0.34.0
+requests==2.31.0
+rpds-py==0.18.0
+scipy==1.13.0
 six==1.16.0
-stevedore==5.0.0
-tzdata==2023.3
-virtualenv==20.20.0
+sortedcontainers==2.4.0
+SPARQLWrapper==2.0.0
+SQLAlchemy==2.0.29
+sssom==0.4.6
+sssom-schema==0.15.2
+stevedore==5.1.0
+tomli==2.0.1
+typing_extensions==4.11.0
+tzdata==2024.1
+urllib3==2.2.1
+validators==0.28.0
+virtualenv==20.25.0
 virtualenv-clone==0.5.7
+wrapt==1.16.0
diff --git a/src/icd11mms-exact-matches.sparql b/src/icd11mms-exact-matches.sparql
deleted file mode 100644
index 59b0894..0000000
--- a/src/icd11mms-exact-matches.sparql
+++ /dev/null
@@ -1,22 +0,0 @@
-prefix skos: <http://www.w3.org/2004/02/skos/core#>
-prefix ECO: <http://purl.obolibrary.org/obo/ECO_>
-prefix owl: <http://www.w3.org/2002/07/owl#>
-prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
-prefix sssom: <https://w3id.org/sssom/>
-
-SELECT ?cls ?xref
-WHERE {
-  VALUES ?mapping_pred { oboInOwl:hasDbXref }
-
-  ?cls a owl:Class;
-    ?mapping_pred ?xref .
-
-  ?xref_anno a owl:Axiom ;
-    owl:annotatedSource ?cls ;
-    owl:annotatedProperty ?mapping_pred ;
-    owl:annotatedTarget ?xref ;
-    ECO:0000218 ?mapping_precision_string .
-
-  FILTER(STRSTARTS(STR(?mapping_precision_string), "- E (Exact mapping: the two concepts are equivalent)."))
-  FILTER(STRSTARTS(STR(?xref), "ICD-11:"))
-}
diff --git a/src/mappings.py b/src/mappings.py
index 66511c3..a18c865 100644
--- a/src/mappings.py
+++ b/src/mappings.py
@@ -1,11 +1,19 @@
-"""Extract mappings"""
+"""Extract mappings
+
+TODOs (minor):
+ 1. icd11.sssom-metadata.yml: Generalize it by removing the 'mapping_provider' field, which links to Orphanet, and
+ adding that field dynamically only for these ORDO mappings.
+"""
 from argparse import ArgumentParser
 from pathlib import Path
 import xml.etree.ElementTree as eleTree
+from typing import Dict
 
+import curies
 import pandas as pd
-
-from utils import write_sssom
+import yaml
+from sssom import MappingSetDataFrame
+from sssom.writers import write_table
 
 SRC_DIR = Path(__file__).parent
 PROJECT_DIR = SRC_DIR.parent
@@ -131,7 +139,14 @@ def run(
 
     # Get only columns we care about
     df = df[['subject_id', 'subject_label', 'predicate_id', 'object_id']]
-    write_sssom(df, input_sssom_config, outpath)
+
+    # Write file
+    with open(input_sssom_config, 'r') as yaml_file:
+        metadata: Dict = yaml.load(yaml_file, Loader=yaml.FullLoader)
+    converter = curies.Converter.from_prefix_map(metadata['curie_map'])
+    msdf: MappingSetDataFrame = MappingSetDataFrame(converter=converter, df=df, metadata=metadata)
+    with open(outpath, 'w') as f:
+        write_table(msdf, f)
 
 
 def cli():
diff --git a/src/temp_compare_matches_owl_and_nomenclature.py b/src/temp_compare_matches_owl_and_nomenclature.py
deleted file mode 100644
index febaa44..0000000
--- a/src/temp_compare_matches_owl_and_nomenclature.py
+++ /dev/null
@@ -1,30 +0,0 @@
-"""Temporary analysis file to compare exact matches to ICD11MMS between Orphanet OWL release and nomenclature files."""
-import pandas as pd
-
-from mappings import INPUT_MMS_MATCHES_TSV, xml_as_df
-
-
-# Read and clean dataframes
-nom_df = xml_as_df()
-nom_df['predicate_id'] = nom_df['DisorderMappingRelation'].apply(
-    lambda x: 'skos:exactMatch' if x.startswith('E (')
-    else 'skos:narrowMatch' if x.startswith('NTBT (')
-    else 'skos:broadMatch' if x.startswith('BTNT (')
-    else 'skos:relatedMatch')  # 0 instances of relatedMatch; just a fallback
-nom_df = nom_df[nom_df['predicate_id'] == 'skos:exactMatch'][['OrphaCode', 'Reference']]\
-    .sort_values(['OrphaCode', 'Reference']).reset_index(drop=True)
-
-owl_df = pd.read_csv(INPUT_MMS_MATCHES_TSV, sep="\t").rename(columns={
-    '?cls': 'OrphaCode',
-    '?xref': 'Reference',
-}).sort_values(['OrphaCode', 'Reference']).reset_index(drop=True)
-owl_df['OrphaCode'] = owl_df['OrphaCode'].apply(lambda x: x.split('_')[1][:-1])  # get code and remove > char
-owl_df['Reference'] = owl_df['Reference'].apply(lambda x: x.split(':')[1])  # get unprefixed code
-
-# Convert to sets of tuple edges
-set_tups = lambda df: set(zip(df['OrphaCode'], df['Reference']))
-nom_set = set_tups(nom_df)  # n=1,340
-owl_set = set_tups(owl_df)  # n=1,459
-i = nom_set.intersection(owl_set)  # n=1,323
-n_diff = nom_set.difference(owl_set)  # n=17
-o_diff = owl_set.difference(nom_set)  # n=136