-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SSSOM outputs & robot template tweaks
SSSOM outputs — Add: code for 2 new outputs: hpo-umls.sssom.tsv and hpo-mesh.sssom.tsv. Robot template tweaks — Remove: intentionally duplicative, no-longer-needed 'MEDGENCUI' xrefs (WIP). General — Add: Python requirement: PyYAML; Update: ORCID in SSSOM config.
- Loading branch information
Showing
7 changed files
with
191 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
pandas | ||
pyyaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
"""Create SSSOM outputs""" | ||
from argparse import ArgumentParser | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
|
||
from src.utils import _get_mapping_set, write_sssom | ||
|
||
# Directory layout, resolved relative to this file so the script works from any working directory.
SRC_DIR = Path(__file__).parent
PROJECT_DIR = SRC_DIR.parent
# Local directory tree named after the MedGen FTP host/path.
FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen"
CONFIG_DIR = PROJECT_DIR / "config"
# Default inputs (also used as the CLI defaults).
INPUT_MAPPINGS = str(FTP_DIR / "MedGenIDMappings.txt")
INPUT_CONFIG = str(CONFIG_DIR / "medgen.sssom-metadata.yml")
# Output files are written to the project root.
OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "hpo-umls.sssom.tsv")
OUTPUT_FILE_HPO_MESH = str(PROJECT_DIR / "hpo-mesh.sssom.tsv")
|
||
|
||
def _filter_and_format_cols(df: pd.DataFrame, source: str) -> pd.DataFrame:
    """Keep only the rows whose 'source' equals `source`, restricted to the SSSOM columns.

    The result contains, in this order, the columns 'subject_id', 'subject_label',
    'predicate_id' and 'object_id'; the original row index is preserved.
    """
    sssom_cols = ['subject_id', 'subject_label', 'predicate_id', 'object_id']
    from_source = df['source'] == source
    return df.loc[from_source, sssom_cols]
|
||
|
||
def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CONFIG):
    """Create SSSOM outputs.

    Reads the MedGen mapping file, then writes three files via `write_sssom`:
      1. OUTPUT_FILE_HPO_UMLS: rows whose source is 'HPO'.
      2. OUTPUT_FILE_HPO_MESH with a '-no-matches' suffix: the rows whose source is 'MeSH', left-joined to
         (1) on the shared ID. Written *before* non-matches are filtered out, so it holds every row, with
         rows lacking an HPO match sorted first.
      3. OUTPUT_FILE_HPO_MESH: the same join, restricted to rows that did find an HPO match.

    :param input_mappings: Path to the '|'-delimited mapping file read by `_get_mapping_set`.
    :param input_sssom_config: Path to the SSSOM metadata YAML passed to `write_sssom`.
    """
    # Read input
    # todo: for the SSSOM use case, it is weird to rename #CUI as xref_id. so maybe _get_mapping_set() should either not
    #  common code for this and robot template, or add a param to not rename that col
    source_df: pd.DataFrame = _get_mapping_set(input_mappings, add_prefixes=True).rename(columns={
        'xref_id': 'subject_id',
        'pref_name': 'subject_label',
        'source_id': 'object_id',
    })
    # Every mapping in the outputs is asserted with the same predicate.
    source_df['predicate_id'] = 'skos:exactMatch'

    # SSSOM 1: HPO<->UMLS
    df_hpo_umls = _filter_and_format_cols(source_df, 'HPO').sort_values(['subject_id', 'object_id'])
    write_sssom(df_hpo_umls, input_sssom_config, OUTPUT_FILE_HPO_UMLS)

    # SSSOM 2: HPO<->MeSH
    # - filter
    df_hpo_mesh = _filter_and_format_cols(source_df, 'MeSH')
    # - JOIN data: some cols temporary for temporary report for non-matches
    #   Both frames share 'subject_id' (the prefixed ID from the mapping file). After the join, the "_x" columns
    #   come from the MeSH rows and the "_y" columns from the HPO rows, so 'object_id_y' (the HPO-side object)
    #   becomes the new subject and the shared ID is kept as 'umls_id'.
    df_hpo_mesh = pd.merge(df_hpo_mesh, df_hpo_umls, on='subject_id', how='left').rename(columns={
        'subject_id': 'umls_id',
        'subject_label_x': 'umls_label',
        'predicate_id_x': 'predicate_id',
        'object_id_x': 'object_id',
        'object_id_y': 'subject_id',
    })
    # -- sort cols & sort rows & drop unneeded cols (subject_label_y, predicate_id_y)
    #    na_position='first' puts rows without an HPO match (NaN subject_id) at the top.
    df_hpo_mesh = df_hpo_mesh[['subject_id', 'predicate_id', 'object_id', 'umls_id', 'umls_label']].sort_values(
        ['subject_id', 'object_id'], na_position='first')
    # -- add missing prefixes
    df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'MESH:' + x)
    # todo: temp; (1) remove later: saving dataset with no matches, for review (2) after remove, will need to
    #  move the col removals below (umls) to above
    write_sssom(df_hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-no-matches.sssom.tsv'))
    # -- filter non-matches & drop unneeded cols
    df_hpo_mesh = df_hpo_mesh[df_hpo_mesh['subject_id'].notna()][[
        x for x in df_hpo_mesh.columns if not x.startswith('umls')]]
    write_sssom(df_hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH)
|
||
|
||
def cli():
    """Parse command line arguments and hand them to `run`.

    Options:
      -m / --input-mappings: mapping file sourced from MedGen (default: INPUT_MAPPINGS).
      -c / --input-sssom-config: SSSOM config yml (default: INPUT_CONFIG).
    """
    arg_parser = ArgumentParser(
        prog='Create SSSOM outputs',
        description='Create SSSOM outputs from MedGen source',
    )
    arg_parser.add_argument(
        '-m', '--input-mappings',
        default=INPUT_MAPPINGS,
        help='Mapping file sourced from MedGen.',
    )
    arg_parser.add_argument(
        '-c', '--input-sssom-config',
        default=INPUT_CONFIG,
        help='SSSOM config yml.',
    )
    args = arg_parser.parse_args()
    run(input_mappings=args.input_mappings, input_sssom_config=args.input_sssom_config)


if __name__ == '__main__':
    cli()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
"""Utils""" | ||
import os | ||
from pathlib import Path | ||
from typing import Set, Union | ||
|
||
import pandas as pd | ||
import yaml | ||
|
||
|
||
def add_prefixes_to_plain_id(x: str) -> str:
    """Turn a plain ID from the original source into a prefixed ID.

    Terms:
        CN: stands for "CUI Novel". These are created for any MedGen records without UMLS CUI.
        C: stands for "CUI". These are sourced from UMLS.
        CUI: stands for "Concept Unique Identifier"
        UID (Unique IDentifier): These are cases where the id is all digits; does not start with a leading alpha char.

    Mapping:
        starts with 'CN' -> 'MEDGENCUI:<id>'
        starts with 'C'  -> 'UMLS:<id>'
        anything else    -> 'MEDGEN:<id>'
    """
    # 'CN' must be tested before 'C', since every 'CN...' ID also starts with 'C'.
    if x.startswith('CN'):
        return f'MEDGENCUI:{x}'
    if x.startswith('C'):
        return f'UMLS:{x}'
    return f'MEDGEN:{x}'
|
||
|
||
def find_prefixes_in_mapping_set(source_df: pd.DataFrame) -> Set[str]:
    """Collect the CURIE prefixes used in a mapping set.

    Looks at the 'subject_id', 'object_id' and 'predicate_id' columns and returns the part of each
    value that precedes the first ':' (a value without ':' is returned whole).

    Missing values (NaN/None) and other non-string values are skipped, so the result only ever
    contains strings. `source_df` is not modified or copied.

    :param source_df: Mapping set; must contain the three columns named above.
    :return: Set of distinct prefixes.
    :raises KeyError: If one of the three columns is absent.
    """
    cols_with_prefixes = ['subject_id', 'object_id', 'predicate_id']
    prefixes: Set[str] = set()
    for col in cols_with_prefixes:
        # unique() first: mapping sets repeat the same few prefixes across many rows.
        distinct_values = source_df[col].dropna().unique()
        prefixes.update(value.split(':')[0] for value in distinct_values if isinstance(value, str))
    return prefixes
|
||
|
||
def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
    """Write a SSSOM file with commented metadata at the top of the file.

    The YAML config at `config_path` is loaded and its 'curie_map' is reduced to the prefixes that
    `find_prefixes_in_mapping_set` finds in `df`. The reduced config is dumped to a temporary file
    (`config_path` + '.tmp'), which `write_tsv_with_comments` then uses as the comment header for the
    TSV written to `outpath`. The temporary file is removed afterwards, even if writing fails.

    :param df: Mapping set to write.
    :param config_path: Path to the SSSOM metadata YAML; must contain a 'curie_map' mapping.
    :param outpath: Destination TSV path.
    :raises KeyError: If the config has no 'curie_map' key.
    """
    temp_filtered_config_path = str(config_path) + '.tmp'
    # Load config (context manager so the handle is closed deterministically)
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    # Filter curie_map: keep only prefixes that actually occur in the mapping set
    prefixes: Set[str] = find_prefixes_in_mapping_set(df)
    config['curie_map'] = {k: v for k, v in config['curie_map'].items() if k in prefixes}
    # Write
    try:
        with open(temp_filtered_config_path, 'w') as f:
            yaml.dump(config, f)
        write_tsv_with_comments(df, temp_filtered_config_path, outpath)
    finally:
        # Never leave the temporary config behind next to the real one.
        if os.path.exists(temp_filtered_config_path):
            os.remove(temp_filtered_config_path)
|
||
|
||
def write_tsv_with_comments(df: pd.DataFrame, comments_file: Union[Path, str], outpath: Union[Path, str]):
    """Write a TSV with comments at the top.

    Every line of `comments_file` is prefixed with '# ' and written to `outpath`, replacing any
    existing file. `df` is then appended as tab-separated values, without the index.

    :param df: Data to write below the comment header.
    :param comments_file: Text file whose lines become the comment header.
    :param outpath: Destination path; overwritten if it exists.
    """
    # write metadata
    with open(comments_file, 'r') as f:
        metadata_str = ''.join('# ' + line for line in f)
    # Guard against a comments file without a final newline, which would otherwise glue the TSV
    # header row onto the last comment line.
    if metadata_str and not metadata_str.endswith('\n'):
        metadata_str += '\n'
    # Mode 'w' truncates, so no separate "remove if exists" step is needed.
    with open(outpath, 'w') as f:
        f.write(metadata_str)
    # write data
    df.to_csv(outpath, index=False, sep='\t', mode='a')
|
||
|
||
def _get_mapping_set(inpath: Union[str, Path], add_prefixes=False) -> pd.DataFrame:
    """Load the MedGen mapping set (MedGenIDMappings.txt) with a few adjustments.

    Adjustments:
      - the '#CUI' column is renamed to 'xref_id';
      - columns that are entirely empty are dropped (the trailing '|' at the end of each row creates one);
      - if `add_prefixes` is set, `add_prefixes_to_plain_id` is applied to every 'xref_id'.
    """
    mappings = pd.read_csv(inpath, sep='|')
    mappings = mappings.rename(columns={'#CUI': 'xref_id'})
    # Columns in which every single value is null.
    all_null_cols = mappings.columns[mappings.isnull().all()].tolist()
    if all_null_cols:
        mappings = mappings.drop(columns=all_null_cols)
    if add_prefixes:
        mappings['xref_id'] = mappings['xref_id'].apply(add_prefixes_to_plain_id)
    return mappings