Skip to content

Commit

Permalink
SSSOM outputs & robot template tweaks
Browse files Browse the repository at this point in the history
SSSOM outputs
- Add: Code for 2 new outputs: hpo-umls.sssom.tsv and hpo-mesh.sssom.tsv
Robot template tweaks
- Remove: Intentionally duplicative, no longer needed 'MEDGENCUI' xrefs
WIP

General
- Add: Python requirement: PyYaml
- Update: ORCID in SSSOM config
  • Loading branch information
joeflack4 committed Mar 24, 2024
1 parent f6a77c4 commit 4ac6825
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 35 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/buid_and_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ jobs:
output/release/medgen.obo
output/release/medgen-disease-extract.obo
output/release/medgen-xrefs.robot.template.tsv
output/release/hpo-umls.sssom.tsv
output/release/hpo-mesh.sssom.tsv
26 changes: 16 additions & 10 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# MedGen ingest
# Running `make all` will run the full pipeline. Note that if the FTP files have already been downloaded, it'll skip
# that part. In order to force re-download, run `make all -B`.
# todo: remove parts of old make/perl pipeline no longer used
.DEFAULT_GOAL := all
.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal
.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal sssom

OBO=http://purl.obolibrary.org/obo
PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl
Expand All @@ -14,10 +15,10 @@ minimal: build-lite stage-lite clean
stage-lite: | output/release/
# mv medgen-disease-extract.owl output/release/
# mv medgen.sssom.tsv output/release/
mv medgen.obo output/release/
mv medgen-disease-extract.obo output/release/
mv medgen-xrefs.robot.template.tsv output/release/
build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv
mv *.obo output/release/
mv *.robot.template.tsv output/release/
mv *.sssom.tsv output/release/
build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv sssom

all: build stage clean analyze
# analyze: runs more than just this file; that goal creates multiple files
Expand Down Expand Up @@ -73,11 +74,16 @@ medgen-disease-extract.owl: medgen-disease-extract.obo
owltools $< -o $@

# SSSOM ----------------------------------
medgen.obographs.json:
robot convert -i medgen-disease-extract.owl -o $@

medgen.sssom.tsv: medgen.obographs.json
sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@
# todo: commented out old pipeline: remove
#medgen.obographs.json:
# robot convert -i medgen-disease-extract.owl -o $@
#
#medgen.sssom.tsv: medgen.obographs.json
# sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@
sssom: hpo-umls.sssom.tsv

hpo-umls.sssom.tsv hpo-mesh.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt
python src/create_sssom.py --input-mappings $< --input-sssom-config config/medgen.sssom-metadata.yml

# ----------------------------------------
# Cycles
Expand Down
1 change: 1 addition & 0 deletions requirements-unlocked.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pandas
pyyaml
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ pbr==5.11.1
platformdirs==3.1.0
python-dateutil==2.8.2
pytz==2023.3
PyYAML==6.0.1
six==1.16.0
stevedore==5.0.0
tzdata==2023.3
virtualenv==20.20.0
virtualenv-clone==0.5.7
virtualenvwrapper==4.8.4
78 changes: 78 additions & 0 deletions src/create_sssom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Create SSSOM outputs"""
from argparse import ArgumentParser
from pathlib import Path

import pandas as pd

from src.utils import _get_mapping_set, write_sssom

# Directory layout, resolved relative to this file rather than the current working directory.
SRC_DIR = Path(__file__).parent
PROJECT_DIR = SRC_DIR.parent
# Directory expected to hold the MedGen files (same relative path the makefile uses as prerequisite).
FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen"
CONFIG_DIR = PROJECT_DIR / "config"
# Default inputs; both can be overridden through the command line options defined in cli().
INPUT_MAPPINGS = str(FTP_DIR / "MedGenIDMappings.txt")
INPUT_CONFIG = str(CONFIG_DIR / "medgen.sssom-metadata.yml")
# Output locations (project root); these are not exposed as command line options.
OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "hpo-umls.sssom.tsv")
OUTPUT_FILE_HPO_MESH = str(PROJECT_DIR / "hpo-mesh.sssom.tsv")


def _filter_and_format_cols(df: pd.DataFrame, source: str) -> pd.DataFrame:
"""FIlter dataframe by source and format columns."""
return df[df['source'] == source][['subject_id', 'subject_label', 'predicate_id', 'object_id']]


def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CONFIG):
    """Create the SSSOM outputs from the MedGen mapping file.

    Three files are written via write_sssom():
    1. OUTPUT_FILE_HPO_UMLS: rows whose source is 'HPO'.
    2. A temporary review file (OUTPUT_FILE_HPO_MESH with the suffix '-no-matches'): every 'MeSH' row, left-joined
       to the HPO rows on the shared (prefixed) CUI, including MeSH rows for which no HPO row was found.
    3. OUTPUT_FILE_HPO_MESH: the same join, restricted to rows that did find an HPO row and without the umls_* cols.

    :param input_mappings: Path to the MedGen mapping file (MedGenIDMappings.txt).
    :param input_sssom_config: Path to the SSSOM metadata yml that is written as a commented header of each output.
    """
    # Read input
    # todo: for the SSSOM use case, it is weird to rename #CUI as xref_id. So maybe _get_mapping_set() should either
    #  not be common code for this and the robot template, or get a param to not rename that col
    mappings: pd.DataFrame = _get_mapping_set(input_mappings, add_prefixes=True)
    mappings = mappings.rename(columns={
        'xref_id': 'subject_id',
        'pref_name': 'subject_label',
        'source_id': 'object_id',
    })
    mappings['predicate_id'] = 'skos:exactMatch'

    # SSSOM 1: HPO<->UMLS
    hpo_umls = _filter_and_format_cols(mappings, 'HPO')
    hpo_umls = hpo_umls.sort_values(['subject_id', 'object_id'])
    write_sssom(hpo_umls, input_sssom_config, OUTPUT_FILE_HPO_UMLS)

    # SSSOM 2: HPO<->MeSH
    # - filter
    mesh_umls = _filter_and_format_cols(mappings, 'MeSH')
    # - JOIN on the CUI; the left join keeps MeSH rows without any HPO counterpart (needed for the review file)
    joined = mesh_umls.merge(hpo_umls, on='subject_id', how='left')
    # - after the join, the HPO ID (object of the HPO rows) becomes the subject; the CUI is kept as umls_id
    joined = joined.rename(columns={
        'subject_id': 'umls_id',
        'subject_label_x': 'umls_label',
        'predicate_id_x': 'predicate_id',
        'object_id_x': 'object_id',
        'object_id_y': 'subject_id',
    })
    # - order cols & rows; this selection also drops the unneeded cols (subject_label_y, predicate_id_y)
    review_cols = ['subject_id', 'predicate_id', 'object_id', 'umls_id', 'umls_label']
    hpo_mesh_review = joined[review_cols].sort_values(['subject_id', 'object_id'], na_position='first')
    # - add missing prefixes
    hpo_mesh_review['object_id'] = hpo_mesh_review['object_id'].apply(lambda mesh_id: 'MESH:' + mesh_id)
    # todo: temp; (1) remove later: saving dataset with no matches, for review (2) after removal, the dropping of
    #  the umls cols below will need to move above
    review_path = OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-no-matches.sssom.tsv')
    write_sssom(hpo_mesh_review, input_sssom_config, review_path)
    # - filter non-matches & drop unneeded cols
    final_cols = [col for col in hpo_mesh_review.columns if not col.startswith('umls')]
    has_hpo_match = hpo_mesh_review['subject_id'].notna()
    hpo_mesh = hpo_mesh_review[has_hpo_match][final_cols]
    write_sssom(hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH)


def cli():
    """Command line interface: parse the options and pass them on to run()."""
    parser = ArgumentParser(prog='Create SSSOM outputs', description='Create SSSOM outputs from MedGen source')
    parser.add_argument('-m', '--input-mappings', default=INPUT_MAPPINGS, help='Mapping file sourced from MedGen.')
    parser.add_argument('-c', '--input-sssom-config', default=INPUT_CONFIG, help='SSSOM config yml.')
    args = parser.parse_args()
    run(input_mappings=args.input_mappings, input_sssom_config=args.input_sssom_config)


if __name__ == '__main__':
    cli()
32 changes: 8 additions & 24 deletions src/mondo_robot_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
- Used here: https://github.com/monarch-initiative/mondo/pull/6560
"""
from argparse import ArgumentParser
from copy import copy
from pathlib import Path
from typing import Dict, List

import pandas as pd

from src.utils import _get_mapping_set, add_prefixes_to_plain_id

SRC_DIR = Path(__file__).parent
PROJECT_DIR = SRC_DIR.parent
FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen"
Expand All @@ -25,37 +25,20 @@
}


def _prefixed_id_rows_from_common_df(source_df: pd.DataFrame, mondo_col='mondo_id', xref_col='xref_id') -> List[Dict]:
"""From worksheets having same common format, get prefixed xrefs for the namespaces we're looking to cover
Note: This same exact function is used in:
- mondo repo: medgen_conflicts_add_xrefs.py
- medgen repo: mondo_robot_template.py"""
df = copy(source_df)
df[xref_col] = df[xref_col].apply(
lambda x: f'MEDGENCUI:{x}' if x.startswith('CN') # "CUI Novel"
else f'UMLS:{x}' if x.startswith('C') # CUI 1 of 2: UMLS
else f'MEDGEN:{x}') # UID
rows = df.to_dict('records')
# CUI 2 of 2: MEDGENCUI:
rows2 = [{mondo_col: x[mondo_col], xref_col: x[xref_col].replace('UMLS', 'MEDGENCUI')} for x in rows if
x[xref_col].startswith('UMLS')]
return rows + rows2


def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
"""Create robot template"""
# Read input
df = pd.read_csv(input_file, sep='|').rename(columns={'#CUI': 'xref_id'})

df: pd.DataFrame = _get_mapping_set(input_file)
# Get explicit Medgen (CUI, CN) -> Mondo mappings
df_medgen_mondo = df[df['source'] == 'MONDO'][['source_id', 'xref_id']].rename(columns={'source_id': 'mondo_id'})
out_df_cui_cn = pd.DataFrame(_prefixed_id_rows_from_common_df(df_medgen_mondo))
out_df_cui_cn = df_medgen_mondo.copy()
out_df_cui_cn['xref_id'] = out_df_cui_cn['xref_id'].apply(add_prefixes_to_plain_id)

# Get Medgen (UID) -> Mondo mappings
# - Done by proxy: UID <-> CUI <-> MONDO
df_medgen_medgenuid = df[df['source'] == 'MedGen'][['source_id', 'xref_id']].rename(
columns={'source_id': 'medgen_uid'})
# todo: should some of these steps be in _reformat_mapping_set()? to be utilized by SSSOM files?
out_df_uid = pd.merge(df_medgen_mondo, df_medgen_medgenuid, on='xref_id').rename(
columns={'xref_id': 'source_id', 'medgen_uid': 'xref_id'})[['mondo_id', 'xref_id', 'source_id']]
out_df_uid['xref_id'] = out_df_uid['xref_id'].apply(lambda x: f'MEDGEN:{x}')
Expand All @@ -66,10 +49,11 @@ def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
out_df = pd.concat([pd.DataFrame([ROBOT_ROW_MAP]), out_df])
out_df.to_csv(output_file, index=False, sep='\t')


def cli():
"""Command line interface."""
parser = ArgumentParser(
prog='"Medgen->Mondo robot template',
prog='Medgen->Mondo robot template',
description='Create a robot template to be used by Mondo to add MedGen xrefs curated by MedGen.')
parser.add_argument(
'-i', '--input-file', default=INPUT_FILE, help='Mapping file sourced from MedGen')
Expand Down
85 changes: 85 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""Utils"""
import os
from pathlib import Path
from typing import Set, Union

import pandas as pd
import yaml


def add_prefixes_to_plain_id(x: Union[str, int, float]) -> str:
    """From plain IDs from the original source, add prefixes.

    Terms:
      CN: stands for "CUI Novel". These are created for any MedGen records without UMLS CUI.
      C: stands for "CUI". These are sourced from UMLS.
      CUI: stands for "Concept Unique Identifier"
      UID (Unique IDentifier): These are cases where the id is all digits; does not start with a leading alpha char.

    Because UIDs are all digits, they may arrive as numbers rather than strings (e.g. when parsed by pandas), so
    numeric input is converted to its plain digit string first. Missing values (None / NaN) are returned unchanged
    instead of raising, so that the function can be applied to columns with gaps.

    :param x: The plain ID.
    :return: 'MEDGENCUI:<x>' for IDs starting with 'CN', 'UMLS:<x>' for other IDs starting with 'C', otherwise
      'MEDGEN:<x>'. Missing values are passed through as is.
    """
    if x is None:
        return x
    if isinstance(x, float):
        if x != x:  # NaN is the only value not equal to itself
            return x
        if x.is_integer():
            x = int(x)  # avoid a trailing '.0' for UIDs that were read as floats
    plain_id = x if isinstance(x, str) else str(x)
    if plain_id.startswith('CN'):
        return f'MEDGENCUI:{plain_id}'
    if plain_id.startswith('C'):
        return f'UMLS:{plain_id}'
    return f'MEDGEN:{plain_id}'


def find_prefixes_in_mapping_set(source_df: pd.DataFrame) -> Set[str]:
    """Find the CURIE prefixes used in a mapping set.

    Looks at the columns subject_id, object_id and predicate_id and collects the part before the first ':' of every
    string value (a value without ':' contributes the whole value). Non-string values such as NaN / None are
    skipped, so the result only ever contains strings. The passed dataframe is not modified.

    :param source_df: Mapping set containing the columns subject_id, object_id and predicate_id.
    :return: Set of distinct prefixes.
    :raises KeyError: If one of the three columns is missing.
    """
    cols_with_prefixes = ['subject_id', 'object_id', 'predicate_id']
    prefixes: Set[str] = set()
    for col in cols_with_prefixes:
        # unique() first: mapping sets repeat the same few prefixes (and the same predicate) many times
        prefixes.update(val.split(':')[0] for val in source_df[col].unique() if isinstance(val, str))
    return prefixes


def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
    """Write a SSSOM file with commented metadata at the top of the file.

    The metadata is taken from the yml at `config_path`, with its `curie_map` reduced to the prefixes that actually
    occur in the mapping set. The reduced config is staged in a temporary file next to the config
    (`<config_path>.tmp`), which is removed again afterwards, also when writing fails.

    :param df: The mapping set to write.
    :param config_path: Path to the SSSOM metadata yml; must contain a `curie_map` mapping.
    :param outpath: Path of the SSSOM TSV to create; an existing file is overwritten.
    """
    temp_filtered_config_path = str(config_path) + '.tmp'
    # Load config
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    # Filter curie_map
    prefixes: Set[str] = find_prefixes_in_mapping_set(df)
    config['curie_map'] = {k: v for k, v in config['curie_map'].items() if k in prefixes}
    # Write
    try:
        with open(temp_filtered_config_path, 'w') as f:
            yaml.dump(config, f)
        write_tsv_with_comments(df, temp_filtered_config_path, outpath)
    finally:
        # never leave the temporary config behind in the config directory
        if os.path.exists(temp_filtered_config_path):
            os.remove(temp_filtered_config_path)


def write_tsv_with_comments(df: pd.DataFrame, comments_file: Union[Path, str], outpath: Union[Path, str]):
    """Write a TSV with comments at the top.

    Every line of `comments_file` is written to `outpath` prefixed with '# ', followed by the dataframe as
    tab-separated values (with header row, without index). An existing `outpath` is overwritten.

    :param df: The data to write below the comments.
    :param comments_file: Text file whose lines become the comment header.
    :param outpath: Path of the TSV to create.
    """
    # write metadata
    with open(comments_file, 'r') as f:
        comment_lines = [
            # guarantee a line break, so the TSV header row can never end up on the last comment line
            '# ' + (line if line.endswith('\n') else line + '\n')
            for line in f]
    with open(outpath, 'w') as f:  # 'w' truncates, so a stale file needs no separate removal
        f.writelines(comment_lines)
    # write data
    df.to_csv(outpath, index=False, sep='\t', mode='a')


def _get_mapping_set(inpath: Union[str, Path], add_prefixes=False) -> pd.DataFrame:
"""Load up MedGen mapping set (MedGenIDMappings.txt), with some modifications.
Mods: Rename column, possibly add prefixes, remove superflous empty column."""
df = pd.read_csv(inpath, sep='|').rename(columns={'#CUI': 'xref_id'})
empty_cols = [col for col in df.columns if df[col].isnull().all()] # caused by trailing | at end of each row
if empty_cols:
df = df.drop(columns=empty_cols)
if add_prefixes:
df['xref_id'] = df['xref_id'].apply(add_prefixes_to_plain_id)
return df

0 comments on commit 4ac6825

Please sign in to comment.