From 4ac68252f26963b9c197cf7961c3fa0438d9fcf7 Mon Sep 17 00:00:00 2001
From: Joe Flack
Date: Fri, 22 Mar 2024 19:33:30 -0400
Subject: [PATCH] SSSOM outputs & robot template tweaks

SSSOM outputs
- Add: Code for 2 new outputs: hpo-umls.sssom.tsv and hpo-mesh.sssom.tsv

Robot template tweaks
- Remove: Intentionally duplicative, no-longer-needed 'MEDGENCUI' xrefs (WIP)

General
- Add: Python requirement: PyYAML
- Update: ORCID in SSSOM config
---
 .github/workflows/buid_and_release.yml |  2 +
 makefile                               | 26 +++++---
 requirements-unlocked.txt              |  1 +
 requirements.txt                       |  2 +-
 src/create_sssom.py                    | 78 +++++++++++++++++++++++
 src/mondo_robot_template.py            | 32 +++-------
 src/utils.py                           | 85 ++++++++++++++++++++++++
 7 files changed, 191 insertions(+), 35 deletions(-)
 create mode 100644 src/create_sssom.py
 create mode 100644 src/utils.py

diff --git a/.github/workflows/buid_and_release.yml b/.github/workflows/buid_and_release.yml
index 997bc9d..f7ec3ee 100644
--- a/.github/workflows/buid_and_release.yml
+++ b/.github/workflows/buid_and_release.yml
@@ -39,3 +39,5 @@ jobs:
           output/release/medgen.obo
           output/release/medgen-disease-extract.obo
           output/release/medgen-xrefs.robot.template.tsv
+          output/release/hpo-umls.sssom.tsv
+          output/release/hpo-mesh.sssom.tsv
diff --git a/makefile b/makefile
index 460bd49..42f6d2c 100644
--- a/makefile
+++ b/makefile
@@ -1,8 +1,9 @@
 # MedGen ingest
 # Running `make all` will run the full pipeline. Note that if the FTP files have already been downloaded, it'll skip
 # that part. In order to force re-download, run `make all -B`.
+# todo: remove parts of old make/perl pipeline no longer used
 .DEFAULT_GOAL := all
-.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal
+.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal sssom
 
 OBO=http://purl.obolibrary.org/obo
 PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl
@@ -14,10 +15,10 @@ minimal: build-lite stage-lite clean
 stage-lite: | output/release/
 #	mv medgen-disease-extract.owl output/release/
 #	mv medgen.sssom.tsv output/release/
-	mv medgen.obo output/release/
-	mv medgen-disease-extract.obo output/release/
-	mv medgen-xrefs.robot.template.tsv output/release/
-build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv
+	mv *.obo output/release/
+	mv *.robot.template.tsv output/release/
+	mv *.sssom.tsv output/release/
+build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv sssom
 
 all: build stage clean analyze
 # analyze: runs more than just this file; that goal creates multiple files
@@ -73,11 +74,16 @@ medgen-disease-extract.owl: medgen-disease-extract.obo
 	owltools $< -o $@
 
 # SSSOM ----------------------------------
-medgen.obographs.json:
-	robot convert -i medgen-disease-extract.owl -o $@
-
-medgen.sssom.tsv: medgen.obographs.json
-	sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@
+# todo: commented out old pipeline: remove
+#medgen.obographs.json:
+#	robot convert -i medgen-disease-extract.owl -o $@
+#
+#medgen.sssom.tsv: medgen.obographs.json
+#	sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@
+sssom: hpo-umls.sssom.tsv
+
+hpo-umls.sssom.tsv hpo-mesh.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt
+	python src/create_sssom.py --input-mappings $< --input-sssom-config config/medgen.sssom-metadata.yml
 
 # ----------------------------------------
 # Cycles
diff --git a/requirements-unlocked.txt b/requirements-unlocked.txt
index fb6c7ed..b052e87 100644
--- a/requirements-unlocked.txt
+++ b/requirements-unlocked.txt
@@ -1 +1,2 @@
 pandas
+pyyaml
diff --git a/requirements.txt b/requirements.txt
index 01e7f87..6f194b3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,9 +6,9 @@ pbr==5.11.1
 platformdirs==3.1.0
 python-dateutil==2.8.2
 pytz==2023.3
+PyYAML==6.0.1
 six==1.16.0
 stevedore==5.0.0
 tzdata==2023.3
 virtualenv==20.20.0
 virtualenv-clone==0.5.7
-virtualenvwrapper==4.8.4
diff --git a/src/create_sssom.py b/src/create_sssom.py
new file mode 100644
index 0000000..65fe316
--- /dev/null
+++ b/src/create_sssom.py
@@ -0,0 +1,78 @@
+"""Create SSSOM outputs"""
+from argparse import ArgumentParser
+from pathlib import Path
+
+import pandas as pd
+
+from src.utils import _get_mapping_set, write_sssom
+
+SRC_DIR = Path(__file__).parent
+PROJECT_DIR = SRC_DIR.parent
+FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen"
+CONFIG_DIR = PROJECT_DIR / "config"
+INPUT_MAPPINGS = str(FTP_DIR / "MedGenIDMappings.txt")
+INPUT_CONFIG = str(CONFIG_DIR / "medgen.sssom-metadata.yml")
+OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "hpo-umls.sssom.tsv")
+OUTPUT_FILE_HPO_MESH = str(PROJECT_DIR / "hpo-mesh.sssom.tsv")
+
+
+def _filter_and_format_cols(df: pd.DataFrame, source: str) -> pd.DataFrame:
+    """Filter dataframe by source and format columns."""
+    return df[df['source'] == source][['subject_id', 'subject_label', 'predicate_id', 'object_id']]
+
+
+def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CONFIG):
+    """Create SSSOM outputs"""
+    # Read input
+    # todo: for the SSSOM use case, it is weird to rename #CUI as xref_id, so maybe _get_mapping_set() should either
+    #  not share common code for this and the robot template, or add a param to not rename that col
+    source_df: pd.DataFrame = _get_mapping_set(input_mappings, add_prefixes=True).rename(columns={
+        'xref_id': 'subject_id',
+        'pref_name': 'subject_label',
+        'source_id': 'object_id',
+    })
+    source_df['predicate_id'] = 'skos:exactMatch'
+
+    # SSSOM 1: HPO<->UMLS
+    df_hpo_umls = _filter_and_format_cols(source_df, 'HPO').sort_values(['subject_id', 'object_id'])
+    write_sssom(df_hpo_umls, input_sssom_config, OUTPUT_FILE_HPO_UMLS)
+
+    # SSSOM 2: HPO<->MeSH
+    # - filter
+    df_hpo_mesh = _filter_and_format_cols(source_df, 'MeSH')
+    # - JOIN data: some cols are temporary, for a temporary report of non-matches
+    df_hpo_mesh = pd.merge(df_hpo_mesh, df_hpo_umls, on='subject_id', how='left').rename(columns={
+        'subject_id': 'umls_id',
+        'subject_label_x': 'umls_label',
+        'predicate_id_x': 'predicate_id',
+        'object_id_x': 'object_id',
+        'object_id_y': 'subject_id',
+    })
+    # -- sort cols & sort rows & drop unneeded cols (subject_label_y, predicate_id_y)
+    df_hpo_mesh = df_hpo_mesh[['subject_id', 'predicate_id', 'object_id', 'umls_id', 'umls_label']].sort_values(
+        ['subject_id', 'object_id'], na_position='first')
+    # -- add missing prefixes
+    df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'MESH:' + x)
+    # todo: temp; (1) remove later: saving dataset with no matches, for review (2) after removal, will need to
+    #  move the col removals below (umls) to above
+    write_sssom(df_hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-no-matches.sssom.tsv'))
+    # -- filter non-matches & drop unneeded cols
+    df_hpo_mesh = df_hpo_mesh[df_hpo_mesh['subject_id'].notna()][[
+        x for x in df_hpo_mesh.columns if not x.startswith('umls')]]
+    write_sssom(df_hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH)
+
+
+def cli():
+    """Command line interface."""
+    parser = ArgumentParser(
+        prog='Create SSSOM outputs',
+        description='Create SSSOM outputs from MedGen source')
+    parser.add_argument(
+        '-m', '--input-mappings', default=INPUT_MAPPINGS, help='Mapping file sourced from MedGen.')
+    parser.add_argument(
+        '-c', '--input-sssom-config', default=INPUT_CONFIG, help='SSSOM config yml.')
+    run(**vars(parser.parse_args()))
+
+
+if __name__ == '__main__':
+    cli()
diff --git a/src/mondo_robot_template.py b/src/mondo_robot_template.py
index 245bf89..c67955d 100644
--- a/src/mondo_robot_template.py
+++ b/src/mondo_robot_template.py
@@ -7,12 +7,12 @@
 - Used here: https://github.com/monarch-initiative/mondo/pull/6560
 """
 from argparse import ArgumentParser
-from copy import copy
 from pathlib import Path
-from typing import Dict, List
 
 import pandas as pd
 
+from src.utils import _get_mapping_set, add_prefixes_to_plain_id
+
 SRC_DIR = Path(__file__).parent
 PROJECT_DIR = SRC_DIR.parent
 FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen"
@@ -25,37 +25,20 @@
 }
 
 
-def _prefixed_id_rows_from_common_df(source_df: pd.DataFrame, mondo_col='mondo_id', xref_col='xref_id') -> List[Dict]:
-    """From worksheets having same common format, get prefixed xrefs for the namespaces we're looking to cover
-
-    Note: This same exact function is used in:
-    - mondo repo: medgen_conflicts_add_xrefs.py
-    - medgen repo: mondo_robot_template.py"""
-    df = copy(source_df)
-    df[xref_col] = df[xref_col].apply(
-        lambda x: f'MEDGENCUI:{x}' if x.startswith('CN')  # "CUI Novel"
-        else f'UMLS:{x}' if x.startswith('C')  # CUI 1 of 2: UMLS
-        else f'MEDGEN:{x}')  # UID
-    rows = df.to_dict('records')
-    # CUI 2 of 2: MEDGENCUI:
-    rows2 = [{mondo_col: x[mondo_col], xref_col: x[xref_col].replace('UMLS', 'MEDGENCUI')} for x in rows if
-             x[xref_col].startswith('UMLS')]
-    return rows + rows2
-
-
 def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
     """Create robot template"""
     # Read input
-    df = pd.read_csv(input_file, sep='|').rename(columns={'#CUI': 'xref_id'})
-
+    df: pd.DataFrame = _get_mapping_set(input_file)
     # Get explicit Medgen (CUI, CN) -> Mondo mappings
     df_medgen_mondo = df[df['source'] == 'MONDO'][['source_id', 'xref_id']].rename(columns={'source_id': 'mondo_id'})
-    out_df_cui_cn = pd.DataFrame(_prefixed_id_rows_from_common_df(df_medgen_mondo))
+    out_df_cui_cn = df_medgen_mondo.copy()
+    out_df_cui_cn['xref_id'] = out_df_cui_cn['xref_id'].apply(add_prefixes_to_plain_id)
 
     # Get Medgen (UID) -> Mondo mappings
     # - Done by proxy: UID <-> CUI <-> MONDO
     df_medgen_medgenuid = df[df['source'] == 'MedGen'][['source_id', 'xref_id']].rename(
         columns={'source_id': 'medgen_uid'})
+    # todo: should some of these steps be in _reformat_mapping_set()? to be utilized by SSSOM files?
     out_df_uid = pd.merge(df_medgen_mondo, df_medgen_medgenuid, on='xref_id').rename(
         columns={'xref_id': 'source_id', 'medgen_uid': 'xref_id'})[['mondo_id', 'xref_id', 'source_id']]
     out_df_uid['xref_id'] = out_df_uid['xref_id'].apply(lambda x: f'MEDGEN:{x}')
@@ -66,10 +49,11 @@ def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
     out_df = pd.concat([pd.DataFrame([ROBOT_ROW_MAP]), out_df])
     out_df.to_csv(output_file, index=False, sep='\t')
 
+
 def cli():
     """Command line interface."""
     parser = ArgumentParser(
-        prog='"Medgen->Mondo robot template',
+        prog='Medgen->Mondo robot template',
         description='Create a robot template to be used by Mondo to add MedGen xrefs curated by MedGen.')
     parser.add_argument(
         '-i', '--input-file', default=INPUT_FILE, help='Mapping file sourced from MedGen')
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..fa60ba8
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,85 @@
+"""Utils"""
+import os
+from pathlib import Path
+from typing import Set, Union
+
+import pandas as pd
+import yaml
+
+
+def add_prefixes_to_plain_id(x: str) -> str:
+    """From plain IDs from the original source, add prefixes.
+
+    Terms:
+    CN: stands for "CUI Novel". These are created for any MedGen records without UMLS CUI.
+    C: stands for "CUI". These are sourced from UMLS.
+    CUI: stands for "Concept Unique Identifier"
+    UID (Unique IDentifier): These are cases where the id is all digits; does not start with a leading alpha char.
+    """
+    return f'MEDGENCUI:{x}' if x.startswith('CN') \
+        else f'UMLS:{x}' if x.startswith('C') \
+        else f'MEDGEN:{x}'
+
+
+def find_prefixes_in_mapping_set(source_df: pd.DataFrame) -> Set[str]:
+    """Find prefixes in mapping set"""
+    df = source_df.copy()
+    cols_with_prefixes = ['subject_id', 'object_id', 'predicate_id']
+    prefixes = set()
+    for col in cols_with_prefixes:
+        col2 = col.replace('id', 'prefix')
+        df[col2] = df[col].apply(lambda x: x.split(':')[0]
+                                 if isinstance(x, str) else x)  # handles nan
+        prefixes.update(set(df[col2].to_list()))
+    return prefixes
+
+
+def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
+    """Write an SSSOM file with commented metadata at the top of the file.
+
+    Filters the curie_map to only those prefixes that exist in the mapping set."""
+    temp_filtered_config_path = str(config_path) + '.tmp'
+    # Load config
+    config = yaml.safe_load(open(config_path, 'r'))
+    # TODO: filter by prefixes in df
+    #  - filter them finally
+    # Filter curie_map
+    prefixes: Set[str] = find_prefixes_in_mapping_set(df)
+    config['curie_map'] = {k: v for k, v in config['curie_map'].items() if k in prefixes}
+    # Write
+    with open(temp_filtered_config_path, 'w') as f:
+        yaml.dump(config, f)
+    write_tsv_with_comments(df, temp_filtered_config_path, outpath)
+    os.remove(temp_filtered_config_path)
+
+
+def write_tsv_with_comments(df: pd.DataFrame, comments_file: Union[Path, str], outpath: Union[Path, str]):
+    """Write a TSV with comments at the top"""
+    # write metadata
+    f = open(comments_file, "r")
+    lines = f.readlines()
+    f.close()
+    output_lines = []
+    for line in lines:
+        output_lines.append("# " + line)
+    metadata_str = ''.join(output_lines)
+    if os.path.exists(outpath):
+        os.remove(outpath)
+    f = open(outpath, 'a')
+    f.write(metadata_str)
+    f.close()
+    # write data
+    df.to_csv(outpath, index=False, sep='\t', mode='a')
+
+
+def _get_mapping_set(inpath: Union[str, Path], add_prefixes=False) -> pd.DataFrame:
+    """Load up MedGen mapping set (MedGenIDMappings.txt), with some modifications.
+
+    Mods: Rename column, possibly add prefixes, remove superfluous empty column."""
+    df = pd.read_csv(inpath, sep='|').rename(columns={'#CUI': 'xref_id'})
+    empty_cols = [col for col in df.columns if df[col].isnull().all()]  # caused by trailing | at end of each row
+    if empty_cols:
+        df = df.drop(columns=empty_cols)
+    if add_prefixes:
+        df['xref_id'] = df['xref_id'].apply(add_prefixes_to_plain_id)
+    return df
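
Reviewer note (not part of the patch): a minimal sketch of how the new pieces fit together, assuming the repo root is on PYTHONPATH and the MedGen FTP file has already been downloaded (e.g. via `make all` or the new `sssom` target). The example IDs below are made up for illustration only.

# sketch.py -- usage example under the assumptions stated above
from src.utils import add_prefixes_to_plain_id
from src.create_sssom import run

# Prefixing rules implemented by add_prefixes_to_plain_id():
assert add_prefixes_to_plain_id('CN000001') == 'MEDGENCUI:CN000001'  # "CUI Novel"
assert add_prefixes_to_plain_id('C0000001') == 'UMLS:C0000001'       # UMLS CUI
assert add_prefixes_to_plain_id('12345') == 'MEDGEN:12345'           # MedGen UID

# Writes hpo-umls.sssom.tsv, hpo-mesh.sssom.tsv, and the temporary
# hpo-mesh-no-matches.sssom.tsv report to the project root; equivalent to the
# makefile rule:
#   python src/create_sssom.py --input-mappings ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt \
#       --input-sssom-config config/medgen.sssom-metadata.yml
run()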