Skip to content

Commit

Permalink
SSSOM outputs
Browse files Browse the repository at this point in the history
- Add: Code for 2 new outputs: hpo-umls.sssom.tsv and hpo-mesh.sssom.tsv

General
- Add: Python requirement: PyYaml
- Update: ORCID in SSSOM config
- Bug fix: Edge case for missing intermediate file in pipeline
- Remove: Intentionally duplicative, no longer needed 'MEDGENCUI' xrefs
- Add: Refactor to load mapping set using function that handles lots of repetitive stuff: get_mapping_set()
  • Loading branch information
joeflack4 committed Mar 30, 2024
1 parent f6a77c4 commit bff4784
Show file tree
Hide file tree
Showing 8 changed files with 205 additions and 39 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/buid_and_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ jobs:
output/release/medgen.obo
output/release/medgen-disease-extract.obo
output/release/medgen-xrefs.robot.template.tsv
output/release/hpo-umls.sssom.tsv
output/release/hpo-mesh.sssom.tsv
32 changes: 20 additions & 12 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# MedGen ingest
# Running `make all` will run the full pipeline. Note that if the FTP files have already been downloaded, it'll skip
# that part. In order to force re-download, run `make all -B`.
# todo: remove parts of old make/perl pipeline no longer used
.DEFAULT_GOAL := all
.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal
.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal sssom

OBO=http://purl.obolibrary.org/obo
PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl
Expand All @@ -14,10 +15,10 @@ minimal: build-lite stage-lite clean
stage-lite: | output/release/
# mv medgen-disease-extract.owl output/release/
# mv medgen.sssom.tsv output/release/
mv medgen.obo output/release/
mv medgen-disease-extract.obo output/release/
mv medgen-xrefs.robot.template.tsv output/release/
build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv
mv *.obo output/release/
mv *.robot.template.tsv output/release/
mv *.sssom.tsv output/release/
build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv sssom

all: build stage clean analyze
# analyze: runs more than just this file; that goal creates multiple files
Expand Down Expand Up @@ -50,6 +51,11 @@ ftp.ncbi.nlm.nih.gov/:
uid2cui.tsv: ftp.ncbi.nlm.nih.gov/
./src/make_uid2cui.pl > $@

ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt: ftp.ncbi.nlm.nih.gov/
if [ -f "ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt.gz" ]; then \
gzip -dk ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt.gz; \
fi

# ----------------------------------------
# Main artefacts
# ----------------------------------------
Expand All @@ -73,11 +79,16 @@ medgen-disease-extract.owl: medgen-disease-extract.obo
owltools $< -o $@

# SSSOM ----------------------------------
medgen.obographs.json:
robot convert -i medgen-disease-extract.owl -o $@
# todo: commented out old pipeline: remove
#medgen.obographs.json:
# robot convert -i medgen-disease-extract.owl -o $@
#
#medgen.sssom.tsv: medgen.obographs.json
# sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@
sssom: hpo-umls.sssom.tsv

medgen.sssom.tsv: medgen.obographs.json
sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@
hpo-umls.sssom.tsv hpo-mesh.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt
python src/create_sssom.py --input-mappings $< --input-sssom-config config/medgen.sssom-metadata.yml

# ----------------------------------------
# Cycles
Expand Down Expand Up @@ -106,9 +117,6 @@ output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.tx
# ----------------------------------------
# Robot templates
# ----------------------------------------
ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt: ftp.ncbi.nlm.nih.gov/
gzip -d ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt.gz

# todo: Ideally I wanted this done at the end of the ingest, permuting from medgen.sssom.tsv, but there were some
# problems with that file. Eventually changing to that feels like it makes more sense. Will have already been
# pre-curated by disease. And some of the logic in this Python script is duplicative.
Expand Down
1 change: 1 addition & 0 deletions requirements-unlocked.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pandas
pyyaml
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ pbr==5.11.1
platformdirs==3.1.0
python-dateutil==2.8.2
pytz==2023.3
PyYAML==6.0.1
six==1.16.0
stevedore==5.0.0
tzdata==2023.3
virtualenv==20.20.0
virtualenv-clone==0.5.7
virtualenvwrapper==4.8.4
1 change: 1 addition & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""MedGen"""
68 changes: 68 additions & 0 deletions src/create_sssom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""Create SSSOM outputs"""
from argparse import ArgumentParser
from pathlib import Path

import pandas as pd

from utils import get_mapping_set, write_sssom

# Project layout: this file lives in src/, one level below the repo root.
SRC_DIR = Path(__file__).parent
PROJECT_DIR = SRC_DIR.parent
# Local mirror of the MedGen FTP download (created by the makefile).
FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen"
CONFIG_DIR = PROJECT_DIR / "config"
# Default input: pipe-delimited MedGen mapping file.
INPUT_MAPPINGS = str(FTP_DIR / "MedGenIDMappings.txt")
# Default SSSOM metadata config (curie_map etc.).
INPUT_CONFIG = str(CONFIG_DIR / "medgen.sssom-metadata.yml")
# Outputs are written to the repo root; the makefile moves *.sssom.tsv to output/release/.
OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "hpo-umls.sssom.tsv")
OUTPUT_FILE_HPO_MESH = str(PROJECT_DIR / "hpo-mesh.sssom.tsv")


def _filter_and_format_cols(df: pd.DataFrame, source: str) -> pd.DataFrame:
    """Filter dataframe to rows from *source* and reduce to the SSSOM columns."""
    mask = df['source'] == source
    sssom_cols = ['subject_id', 'subject_label', 'predicate_id', 'object_id']
    return df[mask][sssom_cols]


def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CONFIG):
    """Create SSSOM outputs

    Writes three files to the project root: hpo-umls.sssom.tsv,
    hpo-mesh-no-matches.sssom.tsv (temporary review artifact), and
    hpo-mesh.sssom.tsv.

    :param input_mappings: Path to MedGenIDMappings.txt sourced from MedGen.
    :param input_sssom_config: Path to the SSSOM metadata config YAML.
    """
    # SSSOM 1: HPO<->UMLS
    df_hpo_umls = get_mapping_set(input_mappings, ['HPO'], add_prefixes=True)
    write_sssom(df_hpo_umls, input_sssom_config, OUTPUT_FILE_HPO_UMLS)

    # SSSOM 2: HPO<->MeSH
    # - filter
    df_hpo_mesh = get_mapping_set(input_mappings, ['MeSH'], add_prefixes=True)
    # - JOIN data: some cols temporary for temporary report for non-matches
    # Both frames key on subject_id (a prefixed UMLS CUI); after the left merge,
    # pandas suffixes the overlapping columns: _x = MeSH side, _y = UMLS/HPO side.
    # The rename then repurposes: object_id_y (the HPO ID) becomes the new
    # subject_id, and the original subject_id (the CUI) is kept as umls_id.
    df_hpo_mesh = pd.merge(df_hpo_mesh, df_hpo_umls, on='subject_id', how='left').rename(columns={
        'subject_id': 'umls_id',
        'subject_label_x': 'umls_label',
        'predicate_id_x': 'predicate_id',
        'object_id_x': 'object_id',
        'object_id_y': 'subject_id',
    })
    # -- sort cols & sort rows & drop unneeded cols (subject_label_y, predicate_id_y)
    # na_position='first' surfaces MeSH rows with no HPO match at the top.
    df_hpo_mesh = df_hpo_mesh[['subject_id', 'predicate_id', 'object_id', 'umls_id', 'umls_label']].sort_values(
        ['subject_id', 'object_id'], na_position='first')
    # -- add missing prefixes
    # NOTE(review): assumes object_id is never NaN (it comes from the left frame
    # of the merge); 'MESH:' + NaN would raise a TypeError — confirm upstream.
    df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'MESH:' + x)
    # todo: temp; (1) remove later: saving dataset with no matches, for review (2) after remove, will need to
    # move the col removals below (umls) to above
    write_sssom(df_hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-no-matches.sssom.tsv'))
    # -- filter non-matches & drop unneeded cols
    # Keep only rows that found an HPO subject, and drop the umls_* report cols.
    df_hpo_mesh = df_hpo_mesh[df_hpo_mesh['subject_id'].notna()][[
        x for x in df_hpo_mesh.columns if not x.startswith('umls')]]
    write_sssom(df_hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH)


def cli():
    """Command line interface."""
    parser = ArgumentParser(
        prog='Create SSSOM outputs',
        description='Create SSSOM outputs from MedGen source')
    parser.add_argument(
        '-m', '--input-mappings', default=INPUT_MAPPINGS, help='Path to mapping file sourced from MedGen.')
    parser.add_argument(
        '-c', '--input-sssom-config', default=INPUT_CONFIG, help='Path to SSSOM config yml.')
    args = parser.parse_args()
    run(input_mappings=args.input_mappings, input_sssom_config=args.input_sssom_config)


if __name__ == '__main__':
    # Script entry point (invoked by the makefile's sssom goal).
    cli()
36 changes: 10 additions & 26 deletions src/mondo_robot_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
- Used here: https://github.com/monarch-initiative/mondo/pull/6560
"""
from argparse import ArgumentParser
from copy import copy
from pathlib import Path
from typing import Dict, List

import pandas as pd

from utils import get_mapping_set, add_prefixes_to_plain_id

SRC_DIR = Path(__file__).parent
PROJECT_DIR = SRC_DIR.parent
FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen"
Expand All @@ -25,37 +25,20 @@
}


def _prefixed_id_rows_from_common_df(source_df: pd.DataFrame, mondo_col='mondo_id', xref_col='xref_id') -> List[Dict]:
"""From worksheets having same common format, get prefixed xrefs for the namespaces we're looking to cover
Note: This same exact function is used in:
- mondo repo: medgen_conflicts_add_xrefs.py
- medgen repo: mondo_robot_template.py"""
df = copy(source_df)
df[xref_col] = df[xref_col].apply(
lambda x: f'MEDGENCUI:{x}' if x.startswith('CN') # "CUI Novel"
else f'UMLS:{x}' if x.startswith('C') # CUI 1 of 2: UMLS
else f'MEDGEN:{x}') # UID
rows = df.to_dict('records')
# CUI 2 of 2: MEDGENCUI:
rows2 = [{mondo_col: x[mondo_col], xref_col: x[xref_col].replace('UMLS', 'MEDGENCUI')} for x in rows if
x[xref_col].startswith('UMLS')]
return rows + rows2


def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
"""Create robot template"""
# Read input
df = pd.read_csv(input_file, sep='|').rename(columns={'#CUI': 'xref_id'})

df: pd.DataFrame = get_mapping_set(input_file)
# Get explicit Medgen (CUI, CN) -> Mondo mappings
df_medgen_mondo = df[df['source'] == 'MONDO'][['source_id', 'xref_id']].rename(columns={'source_id': 'mondo_id'})
out_df_cui_cn = pd.DataFrame(_prefixed_id_rows_from_common_df(df_medgen_mondo))
out_df_cui_cn = df_medgen_mondo.copy()
out_df_cui_cn['xref_id'] = out_df_cui_cn['xref_id'].apply(add_prefixes_to_plain_id)

# Get Medgen (UID) -> Mondo mappings
# - Done by proxy: UID <-> CUI <-> MONDO
df_medgen_medgenuid = df[df['source'] == 'MedGen'][['source_id', 'xref_id']].rename(
columns={'source_id': 'medgen_uid'})
# todo: should some of these steps be in _reformat_mapping_set()? to be utilized by SSSOM files?
out_df_uid = pd.merge(df_medgen_mondo, df_medgen_medgenuid, on='xref_id').rename(
columns={'xref_id': 'source_id', 'medgen_uid': 'xref_id'})[['mondo_id', 'xref_id', 'source_id']]
out_df_uid['xref_id'] = out_df_uid['xref_id'].apply(lambda x: f'MEDGEN:{x}')
Expand All @@ -66,15 +49,16 @@ def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
out_df = pd.concat([pd.DataFrame([ROBOT_ROW_MAP]), out_df])
out_df.to_csv(output_file, index=False, sep='\t')


def cli():
"""Command line interface."""
parser = ArgumentParser(
prog='"Medgen->Mondo robot template',
prog='Medgen->Mondo robot template',
description='Create a robot template to be used by Mondo to add MedGen xrefs curated by MedGen.')
parser.add_argument(
'-i', '--input-file', default=INPUT_FILE, help='Mapping file sourced from MedGen')
'-i', '--input-file', default=INPUT_FILE, help='Path to mapping file sourced from MedGen')
parser.add_argument(
'-o', '--output-file', default=OUTPUT_FILE, help='ROBOT template to be used to add xrefs')
'-o', '--output-file', default=OUTPUT_FILE, help='Path to ROBOT template to be used to add xrefs')
run(**vars(parser.parse_args()))


Expand Down
102 changes: 102 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Utils"""
import os
from pathlib import Path
from typing import List, Set, Union

import pandas as pd
import yaml


def add_prefixes_to_plain_id(x: str) -> str:
    """From plain IDs from original source, add prefixes.
    Terms:
        CN: stands for "CUI Novel". These are created for any MedGen records without UMLS CUI.
        C: stands for "CUI". These are sourced from UMLS.
        CUI: stands for "Concept Unique Identifier"
        UID (Unique IDentifier): These are cases where the id is all digits; does not start with a leading alpha char.
    """
    if x.startswith('CN'):  # "CUI Novel"
        return f'MEDGENCUI:{x}'
    if x.startswith('C'):  # UMLS CUI
        return f'UMLS:{x}'
    return f'MEDGEN:{x}'  # all-digit UID


def find_prefixes_in_mapping_set(source_df: pd.DataFrame) -> Set[str]:
    """Return the set of CURIE prefixes used in the mapping set's ID columns.

    Scans subject_id, object_id, and predicate_id (skipping any of those
    columns that are absent). Non-string cells (e.g. NaN) are ignored — the
    previous implementation leaked them into the result set — and a value
    without a ':' contributes the whole value, as before.

    :param source_df: Mapping set dataframe (SSSOM-style columns).
    :return: Set of prefix strings, e.g. {'UMLS', 'HP', 'skos'}.
    """
    prefixes: Set[str] = set()
    for col in ('subject_id', 'object_id', 'predicate_id'):
        if col not in source_df.columns:
            continue
        # split(':')[0] yields the prefix, or the whole value if no colon.
        prefixes.update(
            x.split(':')[0] for x in source_df[col] if isinstance(x, str))
    return prefixes


def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
    """Write a SSSOM file with commented metadata at the top of the file.

    Filters the config's curie_map down to only the prefixes that actually
    appear in the mapping set before writing it as the commented header.

    :param df: Mapping set (SSSOM columns).
    :param config_path: Path to SSSOM metadata YAML; must contain 'curie_map'.
    :param outpath: Destination .sssom.tsv path.
    """
    temp_filtered_config_path = str(config_path) + '.tmp'
    # Load config; the context manager closes the handle (previously leaked).
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    # Filter curie_map to prefixes present in the data
    prefixes: Set[str] = find_prefixes_in_mapping_set(df)
    config['curie_map'] = {k: v for k, v in config['curie_map'].items() if k in prefixes}
    # Write the filtered config to a temp file, prepend it as comments, then
    # remove it — finally ensures cleanup even if the TSV write fails.
    try:
        with open(temp_filtered_config_path, 'w') as f:
            yaml.dump(config, f)
        write_tsv_with_comments(df, temp_filtered_config_path, outpath)
    finally:
        if os.path.exists(temp_filtered_config_path):
            os.remove(temp_filtered_config_path)


def write_tsv_with_comments(df: pd.DataFrame, comments_file: Union[Path, str], outpath: Union[Path, str]):
    """Write a TSV with comments at the top.

    Each line of *comments_file* is prefixed with '# ' and written first; the
    dataframe is then appended below as tab-separated values.

    :param df: Data to write (no index column).
    :param comments_file: Text file whose lines become the commented header.
    :param outpath: Destination path; any existing file is overwritten.
    """
    # Build the commented metadata header (context manager closes the handle).
    with open(comments_file, 'r') as f:
        metadata_str = ''.join('# ' + line for line in f)
    # Mode 'w' truncates, replacing the previous remove-then-append dance.
    with open(outpath, 'w') as f:
        f.write(metadata_str)
    # Append the data below the header.
    df.to_csv(outpath, index=False, sep='\t', mode='a')


# todo: for the SSSOM use case, it is weird to rename #CUI as xref_id. so maybe this should either not share
# common code for this and robot template, or add a param to not rename that col
def get_mapping_set(
    inpath: Union[str, Path], filter_sources: List[str] = None, add_prefixes=False, sssomify=True,
) -> pd.DataFrame:
    """Load up MedGen mapping set (MedGenIDMappings.txt), with some modifications.

    :param inpath: Path to the pipe-delimited MedGen mapping file.
    :param filter_sources: When given, keep only rows whose 'source' value is in
        this list, then drop the 'source' column.
    :param add_prefixes: When True, convert plain CUI/UID values in 'xref_id'
        into prefixed CURIEs.
    :param sssomify: When True, rename and reduce columns to the standard SSSOM
        set, with a constant skos:exactMatch predicate.
    """
    df = pd.read_csv(inpath, sep='|').rename(columns={'#CUI': 'xref_id'})
    # Each row ends with a trailing '|', which pandas parses as an extra
    # all-null column; drop any such columns.
    all_null_cols = [c for c in df.columns if df[c].isnull().all()]
    if all_null_cols:
        df = df.drop(columns=all_null_cols)
    if add_prefixes:
        df['xref_id'] = df['xref_id'].apply(add_prefixes_to_plain_id)
    df = df.sort_values(['xref_id', 'source_id'])
    if filter_sources:
        df = df[df['source'].isin(filter_sources)]
        df = df.drop(columns=['source'])
    if sssomify:
        sssom_renames = {
            'xref_id': 'subject_id',
            'pref_name': 'subject_label',
            'source_id': 'object_id',
        }
        df = df.rename(columns=sssom_renames)
        df['predicate_id'] = 'skos:exactMatch'
        df = df[['subject_id', 'subject_label', 'predicate_id', 'object_id']]
    return df

0 comments on commit bff4784

Please sign in to comment.