-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Add: Code for 2 new outputs: hpo-umls.sssom.tsv and hpo-mesh.sssom.tsv General - Add: Python requirement: PyYaml - Update: ORCID in SSSOM config - Bug fix: Edge case for missing intermediate file in pipeline - Remove: Intentionally duplicative, no longer needed 'MEDGENCUI' xrefs - Add: Refactor to load mapping set using function that handles lots of repetitive stuff: get_mapping_set()
- Loading branch information
Showing
8 changed files
with
205 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
pandas | ||
pyyaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""MedGen""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
"""Create SSSOM outputs""" | ||
from argparse import ArgumentParser | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
|
||
from utils import get_mapping_set, write_sssom | ||
|
||
SRC_DIR = Path(__file__).parent | ||
PROJECT_DIR = SRC_DIR.parent | ||
FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen" | ||
CONFIG_DIR = PROJECT_DIR / "config" | ||
INPUT_MAPPINGS = str(FTP_DIR / "MedGenIDMappings.txt") | ||
INPUT_CONFIG = str(CONFIG_DIR / "medgen.sssom-metadata.yml") | ||
OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "hpo-umls.sssom.tsv") | ||
OUTPUT_FILE_HPO_MESH = str(PROJECT_DIR / "hpo-mesh.sssom.tsv") | ||
|
||
|
||
def _filter_and_format_cols(df: pd.DataFrame, source: str) -> pd.DataFrame: | ||
"""FIlter dataframe by source and format columns.""" | ||
return df[df['source'] == source][['subject_id', 'subject_label', 'predicate_id', 'object_id']] | ||
|
||
|
||
def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CONFIG):
    """Create SSSOM outputs.

    Writes three files:
      - OUTPUT_FILE_HPO_UMLS: the 'HPO' mapping set as loaded by get_mapping_set().
      - OUTPUT_FILE_HPO_MESH with the suffix '-no-matches.sssom.tsv': temporary review file holding every joined
        MeSH row, including rows for which the join found no HPO counterpart, plus the umls_* helper columns.
      - OUTPUT_FILE_HPO_MESH: only the rows that did get an HPO counterpart, without the umls_* columns.

    :param input_mappings: Path to the MedGen mapping file.
    :param input_sssom_config: Path to the SSSOM metadata yml passed on to write_sssom().
    """
    # SSSOM 1: HPO<->UMLS
    hpo_umls = get_mapping_set(input_mappings, ['HPO'], add_prefixes=True)
    write_sssom(hpo_umls, input_sssom_config, OUTPUT_FILE_HPO_UMLS)

    # SSSOM 2: HPO<->MeSH
    # - filter
    mesh_umls = get_mapping_set(input_mappings, ['MeSH'], add_prefixes=True)
    # - JOIN data on the shared subject_id; the left join keeps MeSH rows that have no HPO row, and the right
    #   frame's object_id (suffix _y) becomes the new subject_id. Some cols are temporary, for the non-match report.
    column_renames = {
        'subject_id': 'umls_id',
        'subject_label_x': 'umls_label',
        'predicate_id_x': 'predicate_id',
        'object_id_x': 'object_id',
        'object_id_y': 'subject_id',
    }
    joined = pd.merge(mesh_umls, hpo_umls, on='subject_id', how='left')
    joined = joined.rename(columns=column_renames)
    # -- sort cols & sort rows & drop unneeded cols (subject_label_y, predicate_id_y)
    ordered_cols = ['subject_id', 'predicate_id', 'object_id', 'umls_id', 'umls_label']
    hpo_mesh = joined[ordered_cols]
    hpo_mesh = hpo_mesh.sort_values(['subject_id', 'object_id'], na_position='first')
    # -- add missing prefixes
    hpo_mesh['object_id'] = hpo_mesh['object_id'].apply(lambda mesh_id: 'MESH:' + mesh_id)
    # todo: temp; (1) remove later: saving dataset with no matches, for review (2) after remove, will need to
    #  move the col removals below (umls) to above
    review_outpath = OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-no-matches.sssom.tsv')
    write_sssom(hpo_mesh, input_sssom_config, review_outpath)
    # -- filter non-matches & drop unneeded cols
    non_umls_cols = [col for col in hpo_mesh.columns if not col.startswith('umls')]
    matched_rows = hpo_mesh[hpo_mesh['subject_id'].notna()]
    hpo_mesh = matched_rows[non_umls_cols]
    write_sssom(hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH)
|
||
|
||
def cli():
    """Command line interface.

    Parses the optional mapping-file and SSSOM-config paths (both default to the module-level input paths)
    and forwards them to run() as keyword arguments.
    """
    arg_parser = ArgumentParser(
        prog='Create SSSOM outputs',
        description='Create SSSOM outputs from MedGen source')
    # (flags, default, help) for each supported option
    option_specs = [
        (('-m', '--input-mappings'), INPUT_MAPPINGS, 'Path to mapping file sourced from MedGen.'),
        (('-c', '--input-sssom-config'), INPUT_CONFIG, 'Path to SSSOM config yml.'),
    ]
    for flags, default_value, help_text in option_specs:
        arg_parser.add_argument(*flags, default=default_value, help=help_text)
    parsed_args = arg_parser.parse_args()
    run(**vars(parsed_args))


if __name__ == '__main__':
    cli()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
"""Utils""" | ||
import os | ||
from pathlib import Path | ||
from typing import List, Set, Union | ||
|
||
import pandas as pd | ||
import yaml | ||
|
||
|
||
def add_prefixes_to_plain_id(x: str) -> str:
    """From plain IDs from original source, add prefixes.

    Terms:
      CN: stands for "CUI Novel". These are created for any MedGen records without UMLS CUI.
      C: stands for "CUI". These are sourced from UMLS.
      CUI: stands for "Concept Unique Identifier"
      UID (Unique IDentifier): These are cases where the id is all digits; does not start with a leading alpha char.

    :param x: The plain identifier.
    :return: ``x`` prefixed with 'MEDGENCUI:' (starts with 'CN'), 'UMLS:' (otherwise starts with 'C'),
      or 'MEDGEN:' (anything else).
    """
    # 'CN' has to be tested first: every 'CN...' id also starts with 'C'.
    if x.startswith('CN'):
        prefix = 'MEDGENCUI'
    elif x.startswith('C'):
        prefix = 'UMLS'
    else:
        prefix = 'MEDGEN'
    return f'{prefix}:{x}'
|
||
|
||
def find_prefixes_in_mapping_set(source_df: pd.DataFrame) -> Set[str]:
    """Find prefixes in mapping set.

    Collects the text before the first ':' of every string value in the subject_id, object_id and
    predicate_id columns. Non-string values (e.g. NaN for missing IDs) are skipped, so the result contains
    only strings. ``source_df`` is read, never modified or copied.

    :param source_df: Mapping set; must have the columns subject_id, object_id and predicate_id.
    :return: The set of distinct prefixes found.
    :raises KeyError: If one of the three columns is missing.
    """
    cols_with_prefixes = ['subject_id', 'object_id', 'predicate_id']
    prefixes: Set[str] = set()
    for col in cols_with_prefixes:
        # isinstance check handles nan (a float), which has no .split()
        prefixes.update(value.split(':')[0] for value in source_df[col] if isinstance(value, str))
    return prefixes
|
||
|
||
def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
    """Writes a SSSOM file with commented metadata at the top of the file.

    Filters only prefixes in curie_map that exist in the mapping set. The filtered config is written to a
    temporary file next to the original config (``<config_path>.tmp``), used as the comment header, and then
    removed again, even when writing fails.

    :param df: The mapping set to write.
    :param config_path: Path to the SSSOM metadata yml; must contain a 'curie_map' mapping.
    :param outpath: Path of the SSSOM TSV to create.
    """
    temp_filtered_config_path = str(config_path) + '.tmp'
    # Load config
    with open(config_path, 'r') as config_file:
        config = yaml.safe_load(config_file)
    # Filter curie_map
    prefixes: Set[str] = find_prefixes_in_mapping_set(df)
    config['curie_map'] = {k: v for k, v in config['curie_map'].items() if k in prefixes}
    # Write
    try:
        with open(temp_filtered_config_path, 'w') as temp_file:
            yaml.dump(config, temp_file)
        write_tsv_with_comments(df, temp_filtered_config_path, outpath)
    finally:
        # Never leave the temporary config behind, whether or not writing succeeded
        if os.path.exists(temp_filtered_config_path):
            os.remove(temp_filtered_config_path)
|
||
|
||
def write_tsv_with_comments(df: pd.DataFrame, comments_file: Union[Path, str], outpath: Union[Path, str]):
    """Write a TSV with comments at the top.

    Every line of ``comments_file`` is written to ``outpath`` prefixed with '# ', followed by ``df`` as
    tab-separated values (header included, index excluded). Any existing file at ``outpath`` is replaced.

    :param df: The table to write.
    :param comments_file: Path to a text file whose lines become the comment header.
    :param outpath: Path of the TSV to create.
    """
    # write metadata
    with open(comments_file, 'r') as comments:
        metadata_str = ''.join('# ' + line for line in comments)
    # Guard: without a final newline the TSV header would be appended onto the last comment line
    if metadata_str and not metadata_str.endswith('\n'):
        metadata_str += '\n'
    with open(outpath, 'w') as out:  # 'w' truncates, so leftovers from a previous run are discarded
        out.write(metadata_str)
    # write data
    df.to_csv(outpath, index=False, sep='\t', mode='a')
|
||
|
||
# todo: for the SSSOM use case, it is weird to rename #CUI as xref_id. So maybe get_mapping_set() should either not | ||
# share common code between this and the robot template, or add a param to not rename that col | ||
def get_mapping_set(
    inpath: Union[str, Path], filter_sources: List[str] = None, add_prefixes=False, sssomify=True,
) -> pd.DataFrame:
    """Load up MedGen mapping set (MedGenIDMappings.txt), with some modifications.

    :param inpath: Path to the pipe-delimited mapping file.
    :param filter_sources: If given (and non-empty), keep only rows whose 'source' is in this list and drop the
      'source' column afterwards.
    :param add_prefixes: If True, prefix the IDs read from the '#CUI' column via add_prefixes_to_plain_id().
    :param sssomify: If True, rename to SSSOM column names, add predicate_id 'skos:exactMatch', and return only
      subject_id, subject_label, predicate_id and object_id.
    :return: The mapping set, sorted by the '#CUI' IDs and then by source_id.
    """
    # Read
    mappings = pd.read_csv(inpath, sep='|')
    mappings = mappings.rename(columns={'#CUI': 'xref_id'})
    # Remove empty columns
    all_null_cols = [name for name in mappings.columns if mappings[name].isnull().all()]  # caused by trailing | at end of each row
    if all_null_cols:
        mappings = mappings.drop(columns=all_null_cols)
    # Add prefixes
    if add_prefixes:
        mappings['xref_id'] = mappings['xref_id'].apply(add_prefixes_to_plain_id)
    # Sort
    mappings = mappings.sort_values(['xref_id', 'source_id'])
    # Filter
    if filter_sources:
        mappings = mappings[mappings['source'].isin(filter_sources)]
        del mappings['source']
    if not sssomify:
        return mappings
    # Standardize to SSSOM
    sssom_names = {
        'xref_id': 'subject_id',
        'pref_name': 'subject_label',
        'source_id': 'object_id',
    }
    mappings = mappings.rename(columns=sssom_names)
    mappings['predicate_id'] = 'skos:exactMatch'
    return mappings[['subject_id', 'subject_label', 'predicate_id', 'object_id']]