diff --git a/.github/workflows/buid_and_release.yml b/.github/workflows/buid_and_release.yml
index f7ec3ee..02ad562 100644
--- a/.github/workflows/buid_and_release.yml
+++ b/.github/workflows/buid_and_release.yml
@@ -39,5 +39,5 @@ jobs:
             output/release/medgen.obo
             output/release/medgen-disease-extract.obo
             output/release/medgen-xrefs.robot.template.tsv
-            output/release/hpo-umls.sssom.tsv
+            output/release/umls-hpo.sssom.tsv
             output/release/hpo-mesh.sssom.tsv
diff --git a/config/medgen.sssom-metadata.yml b/config/medgen.sssom-metadata.yml
index daeb4d9..f93c10d 100644
--- a/config/medgen.sssom-metadata.yml
+++ b/config/medgen.sssom-metadata.yml
@@ -1,4 +1,4 @@
-creator_id: 0000-0002-2906-7319
+creator_id: orcid:0000-0002-2906-7319
 curie_map:
   GTR: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/
   HP: http://purl.obolibrary.org/obo/HP_
@@ -9,6 +9,7 @@ curie_map:
   NCIT: http://purl.obolibrary.org/obo/NCIT_
   OMIM: https://omim.org/entry/
   Orphanet: http://www.orpha.net/ORDO/Orphanet_
+  orcid: https://orcid.org/
   SCTID: http://identifiers.org/snomedct/
   UMLS: http://purl.obolibrary.org/obo/UMLS_
   oboInOwl: http://www.geneontology.org/formats/oboInOwl#
diff --git a/makefile b/makefile
index 8090acb..73ea939 100644
--- a/makefile
+++ b/makefile
@@ -85,9 +85,11 @@ medgen-disease-extract.owl: medgen-disease-extract.obo
 #
 #medgen.sssom.tsv: medgen.obographs.json
 #	sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@
-sssom: hpo-umls.sssom.tsv
+sssom: umls-hpo.sssom.tsv
+	sssom validate umls-hpo.sssom.tsv
+	sssom validate hpo-mesh.sssom.tsv
 
-hpo-umls.sssom.tsv hpo-mesh.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt
+umls-hpo.sssom.tsv hpo-mesh.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt
 	python src/create_sssom.py --input-mappings $< --input-sssom-config config/medgen.sssom-metadata.yml
 
 # ----------------------------------------
diff --git a/requirements-unlocked.txt b/requirements-unlocked.txt
index b052e87..5f3fb30 100644
--- a/requirements-unlocked.txt
+++ b/requirements-unlocked.txt
@@ -1,2 +1,3 @@
 pandas
 pyyaml
+sssom
diff --git a/requirements.txt b/requirements.txt
index 6f194b3..7b8bc1b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,61 @@
+annotated-types==0.6.0
+attrs==23.2.0
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+curies==0.7.9
+Deprecated==1.2.14
+deprecation==2.1.0
 distlib==0.3.6
+exceptiongroup==1.2.0
 filelock==3.9.0
+hbreader==0.9.1
+idna==3.6
+importlib_resources==6.4.0
+iniconfig==2.0.0
+isodate==0.6.1
+json-flattener==0.1.9
+jsonasobj2==1.0.4
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+linkml-runtime==1.7.5
+networkx==3.3
 numpy==1.25.1
+packaging==24.0
 pandas==2.0.3
+pansql==0.0.1
 pbr==5.11.1
 platformdirs==3.1.0
+pluggy==1.4.0
+prefixcommons==0.1.12
+prefixmaps==0.2.3
+pydantic==2.6.4
+pydantic_core==2.16.3
+pyparsing==3.1.2
+pytest==8.1.1
+pytest-logging==2015.11.4
 python-dateutil==2.8.2
+PyTrie==0.4.0
 pytz==2023.3
 PyYAML==6.0.1
+rdflib==7.0.0
+referencing==0.34.0
+requests==2.31.0
+rpds-py==0.18.0
+scipy==1.13.0
 six==1.16.0
+sortedcontainers==2.4.0
+SPARQLWrapper==2.0.0
+SQLAlchemy==2.0.29
+sssom==0.4.6
+sssom-schema==0.15.2
 stevedore==5.0.0
+tomli==2.0.1
+typing_extensions==4.11.0
 tzdata==2023.3
+urllib3==2.2.1
+validators==0.28.0
 virtualenv==20.20.0
 virtualenv-clone==0.5.7
+virtualenvwrapper==4.8.4
+wrapt==1.16.0
diff --git a/src/create_sssom.py b/src/create_sssom.py
index 3a3910d..c4c4dcc 100644
--- a/src/create_sssom.py
+++ b/src/create_sssom.py
@@ -12,7 +12,7 @@ CONFIG_DIR = PROJECT_DIR / "config"
 
 INPUT_MAPPINGS = str(FTP_DIR / "MedGenIDMappings.txt")
 INPUT_CONFIG = str(CONFIG_DIR / "medgen.sssom-metadata.yml")
-OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "hpo-umls.sssom.tsv")
+OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "umls-hpo.sssom.tsv")
 OUTPUT_FILE_HPO_MESH = str(PROJECT_DIR / "hpo-mesh.sssom.tsv")
 
 
@@ -21,10 +21,17 @@ def _filter_and_format_cols(df: pd.DataFrame, source: str) -> pd.DataFrame:
     return df[df['source'] == source][['subject_id', 'subject_label', 'predicate_id', 'object_id']]
 
 
-def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CONFIG):
-    """Create SSSOM outputs"""
+def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CONFIG, hpo_match_only_with_umls=True):
+    """Create SSSOM outputs
+
+    :param hpo_match_only_with_umls: If True, only create SSSOM outputs for HPO mappings that have UMLS mappings, and
+    will filter out other matches. This is purely edge case handling. As of 2024/04/06, 100% of the mappings were UMLS
+    anyway."""
     # SSSOM 1: HPO<->UMLS
     df_hpo_umls = get_mapping_set(input_mappings, ['HPO'], add_prefixes=True)
+    if hpo_match_only_with_umls:
+        df_hpo_umls = df_hpo_umls[df_hpo_umls['subject_id'].str.startswith('UMLS:')]
+    df_hpo_umls['mapping_justification'] = 'semapv:ManualMappingCuration'
     write_sssom(df_hpo_umls, input_sssom_config, OUTPUT_FILE_HPO_UMLS)
 
     # SSSOM 2: HPO<->MeSH
@@ -45,7 +52,10 @@ def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CO
     df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'MESH:' + x)
     # todo: temp; (1) remove later: saving dataset with no matches, for review (2) after remove, will need to
     #  move the col removals below (umls) to above
-    write_sssom(df_hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-no-matches.sssom.tsv'))
+    # - add mapping_justification
+    df_hpo_mesh['mapping_justification'] = 'semapv:ManualMappingCuration'
+    write_sssom(df_hpo_mesh, input_sssom_config,
+                OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-non-matches-included.sssom.tsv'))
     # -- filter non-matches & drop unneeded cols
     df_hpo_mesh = df_hpo_mesh[df_hpo_mesh['subject_id'].notna()][[
         x for x in df_hpo_mesh.columns if not x.startswith('umls')]]
diff --git a/src/utils.py b/src/utils.py
index 5e7e5c5..fdf31b3 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -1,10 +1,12 @@
 """Utils"""
-import os
 from pathlib import Path
-from typing import List, Set, Union
+from typing import Dict, List, Union
 
+import curies
 import pandas as pd
 import yaml
+from sssom import MappingSetDataFrame
+from sssom.writers import write_table
 
 
 def add_prefixes_to_plain_id(x: str) -> str:
@@ -21,53 +23,14 @@ def add_prefixes_to_plain_id(x: str) -> str:
         else f'MEDGEN:{x}'
 
 
-def find_prefixes_in_mapping_set(source_df: pd.DataFrame) -> Set[str]:
-    """Find prefixes in mapping set"""
-    df = source_df.copy()
-    cols_with_prefixes = ['subject_id', 'object_id', 'predicate_id']
-    prefixes = set()
-    for col in cols_with_prefixes:
-        col2 = col.replace('id', 'prefix')
-        df[col2] = df[col].apply(lambda x: x.split(':')[0]
-                                 if isinstance(x, str) else x)  # handles nan
-        prefixes.update(set(df[col2].to_list()))
-    return prefixes
-
-
 def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
-    """Writes a SSSOM file with commented metadata at the top of the file.
-
-    Filters only prefxes in curie_map that exist in the mapping set."""
-    temp_filtered_config_path = str(config_path) + '.tmp'
-    # Load config
-    config = yaml.safe_load(open(config_path, 'r'))
-    # Filter curie_map
-    prefixes: Set[str] = find_prefixes_in_mapping_set(df)
-    config['curie_map'] = {k: v for k, v in config['curie_map'].items() if k in prefixes}
-    # Write
-    with open(temp_filtered_config_path, 'w') as f:
-        yaml.dump(config, f)
-    write_tsv_with_comments(df, temp_filtered_config_path, outpath)
-    os.remove(temp_filtered_config_path)
-
-
-def write_tsv_with_comments(df: pd.DataFrame, comments_file: Union[Path, str], outpath: Union[Path, str]):
-    """Write a TSV with comments at the top"""
-    # write metadata
-    f = open(comments_file, "r")
-    lines = f.readlines()
-    f.close()
-    output_lines = []
-    for line in lines:
-        output_lines.append("# " + line)
-    metadata_str = ''.join(output_lines)
-    if os.path.exists(outpath):
-        os.remove(outpath)
-    f = open(outpath, 'a')
-    f.write(metadata_str)
-    f.close()
-    # write data
-    df.to_csv(outpath, index=False, sep='\t', mode='a')
+    """Writes a SSSOM file"""
+    with open(config_path, 'r') as yaml_file:
+        metadata: Dict = yaml.load(yaml_file, Loader=yaml.FullLoader)
+    converter = curies.Converter.from_prefix_map(metadata['curie_map'])
+    msdf: MappingSetDataFrame = MappingSetDataFrame(converter=converter, df=df, metadata=metadata)
+    with open(outpath, 'w') as f:
+        write_table(msdf, f)
 
 
 # todo: for the SSSOM use case, it is weird to rename #CUI as xref_id. so maybe _get_mapping_set() should either not
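
Reviewer note (not part of the patch): the refactored utils.write_sssom() delegates header writing to sssom-py instead of hand-commenting the YAML config. Below is a minimal sketch of that write path, assuming the sssom, curies, pandas, and pyyaml packages pinned above are installed. The metadata dict, the single mapping row, the predicate, and the example.sssom.tsv output name are illustrative placeholders, not values taken from MedGen.

# sketch_write_sssom.py -- illustrative only; mirrors the calls src/utils.py uses after this diff
import curies
import pandas as pd
from sssom import MappingSetDataFrame
from sssom.writers import write_table

# Toy stand-in for config/medgen.sssom-metadata.yml (the real file carries the full curie_map)
metadata = {
    'creator_id': 'orcid:0000-0002-2906-7319',
    'curie_map': {
        'HP': 'http://purl.obolibrary.org/obo/HP_',
        'UMLS': 'http://purl.obolibrary.org/obo/UMLS_',
        'oboInOwl': 'http://www.geneontology.org/formats/oboInOwl#',
        'orcid': 'https://orcid.org/',
        'semapv': 'https://w3id.org/semapv/vocab/',
    },
}

# One hypothetical UMLS<->HPO row shaped like the columns create_sssom.py emits
df = pd.DataFrame([{
    'subject_id': 'UMLS:C0000000',            # placeholder CUI
    'subject_label': 'Example disease term',  # placeholder label
    'predicate_id': 'oboInOwl:hasDbXref',     # placeholder predicate
    'object_id': 'HP:0000118',
    'mapping_justification': 'semapv:ManualMappingCuration',
}])

# Same calls as the new utils.write_sssom(): build a converter from the curie_map, wrap the
# frame in a MappingSetDataFrame, and let sssom-py write the commented metadata header plus
# the TSV body. sssom-py may warn about (and default) metadata fields the toy dict omits.
converter = curies.Converter.from_prefix_map(metadata['curie_map'])
msdf = MappingSetDataFrame(converter=converter, df=df, metadata=metadata)
with open('example.sssom.tsv', 'w') as f:
    write_table(msdf, f)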