SSSOM outputs
- Update: Utilizing standard sssom-py functionality
- Add: sssom validate
- Add: mapping_justification
- Update: Subject/object order & filename
- Add: Mandatory filtering to keep only UMLS matches in the HPO-UMLS SSSOM.
joeflack4 committed Apr 7, 2024
1 parent bd2ff8a commit 0dd37ea
Showing 6 changed files with 76 additions and 55 deletions.
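
For reference, a minimal sketch (not part of this commit) of how the released SSSOM TSVs could be spot-checked after a build. It assumes the output/release/ paths listed in the workflow diff just below and that pandas is installed; the '#'-commented metadata block at the top of each file is skipped via comment='#'.

import pandas as pd

# Read a released SSSOM TSV, skipping the commented YAML metadata header.
df = pd.read_csv("output/release/umls-hpo.sssom.tsv", sep="\t", comment="#")

# Every row is expected to carry the new mapping_justification column, and,
# given the mandatory UMLS-only filter, every subject should be a UMLS CURIE.
assert (df["mapping_justification"] == "semapv:ManualMappingCuration").all()
assert df["subject_id"].str.startswith("UMLS:").all()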
2 changes: 1 addition & 1 deletion .github/workflows/buid_and_release.yml
@@ -39,5 +39,5 @@ jobs:
output/release/medgen.obo
output/release/medgen-disease-extract.obo
output/release/medgen-xrefs.robot.template.tsv
output/release/hpo-umls.sssom.tsv
output/release/umls-hpo.sssom.tsv
output/release/hpo-mesh.sssom.tsv
4 changes: 2 additions & 2 deletions makefile
@@ -85,9 +85,9 @@ medgen-disease-extract.owl: medgen-disease-extract.obo
#
#medgen.sssom.tsv: medgen.obographs.json
# sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@
sssom: hpo-umls.sssom.tsv
sssom: umls-hpo.sssom.tsv

hpo-umls.sssom.tsv hpo-mesh.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt
umls-hpo.sssom.tsv hpo-mesh.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt
python src/create_sssom.py --input-mappings $< --input-sssom-config config/medgen.sssom-metadata.yml

# ----------------------------------------
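
For orientation, a rough Python equivalent of what `make sssom` drives (illustrative only; it assumes the repository root is the working directory and that src/ is importable), using the run() entry point and the same inputs the make rule passes on the command line. See the src/create_sssom.py diff below.

import sys

sys.path.insert(0, "src")  # assumes the repository root is the current working directory
from create_sssom import run  # entry point shown in the src/create_sssom.py diff below

# Same inputs the make rule passes on the command line.
run(
    input_mappings="ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt",
    input_sssom_config="config/medgen.sssom-metadata.yml",
)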
1 change: 1 addition & 0 deletions requirements-unlocked.txt
@@ -1,2 +1,3 @@
pandas
pyyaml
sssom
47 changes: 47 additions & 0 deletions requirements.txt
@@ -1,14 +1,61 @@
annotated-types==0.6.0
attrs==23.2.0
certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7
curies==0.7.9
Deprecated==1.2.14
deprecation==2.1.0
distlib==0.3.6
exceptiongroup==1.2.0
filelock==3.9.0
hbreader==0.9.1
idna==3.6
importlib_resources==6.4.0
iniconfig==2.0.0
isodate==0.6.1
json-flattener==0.1.9
jsonasobj2==1.0.4
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
linkml-runtime==1.7.5
networkx==3.3
numpy==1.25.1
packaging==24.0
pandas==2.0.3
pansql==0.0.1
pbr==5.11.1
platformdirs==3.1.0
pluggy==1.4.0
prefixcommons==0.1.12
prefixmaps==0.2.3
pydantic==2.6.4
pydantic_core==2.16.3
pyparsing==3.1.2
pytest==8.1.1
pytest-logging==2015.11.4
python-dateutil==2.8.2
PyTrie==0.4.0
pytz==2023.3
PyYAML==6.0.1
rdflib==7.0.0
referencing==0.34.0
requests==2.31.0
rpds-py==0.18.0
scipy==1.13.0
six==1.16.0
sortedcontainers==2.4.0
SPARQLWrapper==2.0.0
SQLAlchemy==2.0.29
sssom==0.4.6
sssom-schema==0.15.2
stevedore==5.0.0
tomli==2.0.1
typing_extensions==4.11.0
tzdata==2023.3
urllib3==2.2.1
validators==0.28.0
virtualenv==20.20.0
virtualenv-clone==0.5.7
virtualenvwrapper==4.8.4
wrapt==1.16.0
18 changes: 14 additions & 4 deletions src/create_sssom.py
@@ -12,7 +12,7 @@
CONFIG_DIR = PROJECT_DIR / "config"
INPUT_MAPPINGS = str(FTP_DIR / "MedGenIDMappings.txt")
INPUT_CONFIG = str(CONFIG_DIR / "medgen.sssom-metadata.yml")
OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "hpo-umls.sssom.tsv")
OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "umls-hpo.sssom.tsv")
OUTPUT_FILE_HPO_MESH = str(PROJECT_DIR / "hpo-mesh.sssom.tsv")


@@ -21,10 +21,17 @@ def _filter_and_format_cols(df: pd.DataFrame, source: str) -> pd.DataFrame:
return df[df['source'] == source][['subject_id', 'subject_label', 'predicate_id', 'object_id']]


def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CONFIG):
"""Create SSSOM outputs"""
def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CONFIG, hpo_match_only_with_umls=True):
"""Create SSSOM outputs
:param hpo_match_only_with_umls: If True, only create SSSOM outputs for HPO mappings that have UMLS mappings, and
will filter out other matches. This is purely edge case handling. As of 2024/04/06, 100% of the mappings were UMLS
anyway."""
# SSSOM 1: HPO<->UMLS
df_hpo_umls = get_mapping_set(input_mappings, ['HPO'], add_prefixes=True)
if hpo_match_only_with_umls:
df_hpo_umls = df_hpo_umls[df_hpo_umls['subject_id'].str.startswith('UMLS:')]
df_hpo_umls['mapping_justification'] = 'semapv:ManualMappingCuration'
write_sssom(df_hpo_umls, input_sssom_config, OUTPUT_FILE_HPO_UMLS)

# SSSOM 2: HPO<->MeSH
@@ -45,7 +52,10 @@ def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CO
df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'MESH:' + x)
# todo: temp; (1) remove later: saving dataset with no matches, for review (2) after remove, will need to
# move the col removals below (umls) to above
write_sssom(df_hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-no-matches.sssom.tsv'))
# - add mapping_justification
df_hpo_mesh['mapping_justification'] = 'semapv:ManualMappingCuration'
write_sssom(df_hpo_mesh, input_sssom_config,
OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-non-matches-included.sssom.tsv'))
# -- filter non-matches & drop unneeded cols
df_hpo_mesh = df_hpo_mesh[df_hpo_mesh['subject_id'].notna()][[
x for x in df_hpo_mesh.columns if not x.startswith('umls')]]
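
A minimal, self-contained sketch of the new UMLS-only filter and the mapping_justification assignment shown above, using toy rows rather than MedGen data (illustrative only).

import pandas as pd

# Toy mapping rows; with hpo_match_only_with_umls=True, only the UMLS-prefixed
# subject survives, mirroring the filter in run() above.
df = pd.DataFrame({
    "subject_id": ["UMLS:C0000768", "MEDGEN:1234"],
    "subject_label": ["Congenital abnormality", "Some other concept"],
    "predicate_id": ["skos:exactMatch", "skos:exactMatch"],
    "object_id": ["HP:0000118", "HP:0000001"],
})
df = df[df["subject_id"].str.startswith("UMLS:")].copy()
df["mapping_justification"] = "semapv:ManualMappingCuration"
print(df)  # one row left: UMLS:C0000768 -> HP:0000118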
59 changes: 11 additions & 48 deletions src/utils.py
@@ -1,10 +1,12 @@
"""Utils"""
import os
from pathlib import Path
from typing import List, Set, Union
from typing import Dict, List, Union

import curies
import pandas as pd
import yaml
from sssom import MappingSetDataFrame
from sssom.writers import write_table


def add_prefixes_to_plain_id(x: str) -> str:
@@ -21,53 +23,14 @@ def add_prefixes_to_plain_id(x: str) -> str:
else f'MEDGEN:{x}'


def find_prefixes_in_mapping_set(source_df: pd.DataFrame) -> Set[str]:
"""Find prefixes in mapping set"""
df = source_df.copy()
cols_with_prefixes = ['subject_id', 'object_id', 'predicate_id']
prefixes = set()
for col in cols_with_prefixes:
col2 = col.replace('id', 'prefix')
df[col2] = df[col].apply(lambda x: x.split(':')[0]
if isinstance(x, str) else x) # handles nan
prefixes.update(set(df[col2].to_list()))
return prefixes


def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
"""Writes a SSSOM file with commented metadata at the top of the file.
Filters only prefixes in curie_map that exist in the mapping set."""
temp_filtered_config_path = str(config_path) + '.tmp'
# Load config
config = yaml.safe_load(open(config_path, 'r'))
# Filter curie_map
prefixes: Set[str] = find_prefixes_in_mapping_set(df)
config['curie_map'] = {k: v for k, v in config['curie_map'].items() if k in prefixes}
# Write
with open(temp_filtered_config_path, 'w') as f:
yaml.dump(config, f)
write_tsv_with_comments(df, temp_filtered_config_path, outpath)
os.remove(temp_filtered_config_path)


def write_tsv_with_comments(df: pd.DataFrame, comments_file: Union[Path, str], outpath: Union[Path, str]):
"""Write a TSV with comments at the top"""
# write metadata
f = open(comments_file, "r")
lines = f.readlines()
f.close()
output_lines = []
for line in lines:
output_lines.append("# " + line)
metadata_str = ''.join(output_lines)
if os.path.exists(outpath):
os.remove(outpath)
f = open(outpath, 'a')
f.write(metadata_str)
f.close()
# write data
df.to_csv(outpath, index=False, sep='\t', mode='a')
"""Writes a SSSOM file"""
with open(config_path, 'r') as yaml_file:
metadata: Dict = yaml.load(yaml_file, Loader=yaml.FullLoader)
converter = curies.Converter.from_prefix_map(metadata['curie_map'])
msdf: MappingSetDataFrame = MappingSetDataFrame(converter=converter, df=df, metadata=metadata)
with open(outpath, 'w') as f:
write_table(msdf, f)


# todo: for the SSSOM use case, it is weird to rename #CUI as xref_id. so maybe _get_mapping_set() should either not
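
To illustrate the new sssom-py-based path end to end, here is a minimal usage sketch of write_sssom() with one toy mapping row and an on-the-fly metadata YAML. The file names and prefix expansions are illustrative, not part of this commit, and it assumes the repository root is the working directory.

import sys

import pandas as pd
import yaml

sys.path.insert(0, "src")      # assumes the repository root is the current working directory
from utils import write_sssom  # the helper defined in the src/utils.py diff above

# One toy mapping row with the columns the pipeline produces.
df = pd.DataFrame({
    "subject_id": ["UMLS:C0000768"],
    "subject_label": ["Congenital abnormality"],
    "predicate_id": ["skos:exactMatch"],
    "object_id": ["HP:0000118"],
    "mapping_justification": ["semapv:ManualMappingCuration"],
})

# A minimal stand-in for config/medgen.sssom-metadata.yml; the IDs and prefix
# expansions here are illustrative.
config = {
    "mapping_set_id": "https://example.org/toy.sssom.tsv",
    "license": "https://creativecommons.org/publicdomain/zero/1.0/",
    "curie_map": {
        "HP": "http://purl.obolibrary.org/obo/HP_",
        "UMLS": "https://uts.nlm.nih.gov/uts/umls/concept/",
        "skos": "http://www.w3.org/2004/02/skos/core#",
        "semapv": "https://w3id.org/semapv/vocab/",
    },
}
with open("toy.sssom-metadata.yml", "w") as f:
    yaml.dump(config, f)

# Writes toy.sssom.tsv with the metadata as a commented header block.
write_sssom(df, "toy.sssom-metadata.yml", "toy.sssom.tsv")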
