From 47d744cc89f80948b3348363d7879c8715842531 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Thu, 6 Jun 2024 16:46:55 -0400 Subject: [PATCH 1/3] HGNC ROBOT template - Rename: mondo_genes.csv --> mondo_genes.robot.tsv - Update: Change from CSV to TSV - Update: Set a ROBOT sub-header - Update: remove < > around URIs - Update: remove ?'s at start of col names - Update: insert source_code col, w/ values: MONDO:OMIM General: - Add: run.sh: For running ODK. And updated README.md w/ docs about that. - Update: README.md: Put some less important stuff in a <details> section
--- .github/workflows/buid_and_release.yml | 2 +- .gitignore | 2 +- README.md | 17 ++++-- makefile | 14 ++++- run.sh | 85 ++++++++++++++++++++++++++ 5 files changed, 111 insertions(+), 9 deletions(-) create mode 100644 run.sh diff --git a/.github/workflows/buid_and_release.yml b/.github/workflows/buid_and_release.yml index 747c4d1..c2a4638 100644 --- a/.github/workflows/buid_and_release.yml +++ b/.github/workflows/buid_and_release.yml @@ -40,4 +40,4 @@ jobs: files: | omim.owl omim.sssom.tsv - mondo_genes.csv + mondo_genes.robot.tsv diff --git a/.gitignore b/.gitignore index 00a176d..ae2ebb0 100644 --- a/.gitignore +++ b/.gitignore @@ -34,4 +34,4 @@ omim.json mondo_exactmatch_omim.sssom.tsv mondo_exactmatch_omimps.sssom.tsv omim.owl -mondo_genes.csv +mondo_genes.robot.tsv diff --git a/README.md b/README.md index 48f105d..05bbaae 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ OMIM stands for "Online Mendelian Inheritance in Man", and is an online catalog of human genes and genetic disorders. The official site is: https://omim.org/ This purpose of this repository is for data transformations for ingest into Mondo. Mainly, -it is for generating an `omim.ttl` file. +it is for generating an `omim.ttl` and other release artefacts. Disclaimer: This repository and its created data artefacts are unnofficial. For official, up-to-date OMIM data, please visit [omim.org](https://omim.org). @@ -31,10 +31,10 @@ you get an error related to this when installing, ignore it, as it is does not seem to be needed to run any of the tools. If however you do get a `psutil` error when running anything, please let us know by [creating an issue](https://github.com/monarch-initiative/omim/issues/new). -## Running & creating `omim.ttl` -Run: `make all` +## Running & creating release +Run: `sh run.sh make all` -Running this will create a new `omim.ttl` file in the root directory. +Running this will create new release artefacts in the root directory. 
You can also run `make build` or `python -m omim2obo`. These are all the same command. This will download files from omim.org and run the build. @@ -44,8 +44,11 @@ If there's an issue downloading the files, or you are offline, or you just want to use the cache anyway, you can pass the `--use-cache` flag. ## Additional tools +
Details +

+ ### Get PMIDs used for OMIM codes from `omim.ttl` -Command: `make get-pmids` +Command: `sh run.sh make get-pmids` ### OMIM Code Web Scraper Currently, the only feature is `get_codes_by_yyyy_mm`, which returns a list of @@ -86,3 +89,7 @@ from omim2obo.omim_code_scraper import get_codes_by_yyyy_mm code_tuples = get_codes_by_yyyy_mm('2021/05') ``` + + +

+
diff --git a/makefile b/makefile index 11f3af7..eb97c04 100644 --- a/makefile +++ b/makefile @@ -2,7 +2,7 @@ # MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------ -all: omim.ttl omim.sssom.tsv omim.owl mondo_genes.csv +all: omim.ttl omim.sssom.tsv omim.owl mondo_genes.robot.tsv # build: Create new omim.ttl omim.ttl: @@ -35,8 +35,18 @@ omim.owl: omim.ttl mondo_exactmatch_omim.sssom.owl mondo_exactmatch_omimps.sssom query --update sparql/hgnc_links.ru \ convert -f ofn -o $@ -mondo_genes.csv: omim.owl +mondo_genes.robot.tsv: omim.owl + # Create a TSV of relational information for gene and disease classes robot query -i omim.owl --query sparql/mondo_genes.sparql $@ + # Insert the source_code column as the second to last column + awk 'BEGIN {FS=OFS="\t"} {if (NR==1) {$$(NF+1)=$$(NF); $$(NF-1)="?source_code";} else {$$(NF+1)=$$(NF); $$(NF-1)="MONDO:OMIM";}} 1' $@ > temp_file && mv temp_file $@ + # Remove the first character, a question mark (?), from each field in the header. This is an artefact of the SPARQL query. + awk 'BEGIN {FS=OFS="\t"} NR==1 {for (i=1; i<=NF; i++) $$i=substr($$i, 2)} {print}' $@ > temp_file && mv temp_file $@ + # Remove < and > characters from specified columns + awk 'BEGIN {FS=OFS="\t"} NR>1 {gsub(/^<|>$$/, "", $$1); gsub(/^<|>$$/, "", $$2); gsub(/^<|>$$/, "", $$5)} {print}' $@ > temp_file && mv temp_file $@ + # Insert ROBOT subheader + robot_subheader="ID\tSC 'has material basis in germline mutation in' some %\t>A oboInOwl:source\t>A oboInOwl:source\t" && \ + sed 1a"$$robot_subheader" $@ > temp_file && mv temp_file $@ cleanup: @rm -f omim.json diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..1f6db72 --- /dev/null +++ b/run.sh @@ -0,0 +1,85 @@ +#!/bin/sh +# Wrapper script for docker. +# +# This is used primarily for wrapping the GNU Make workflow. +# Instead of typing "make TARGET", type "./run.sh make TARGET". 
+# This will run the make workflow within a docker container. +# +# The assumption is that you are working in the src/ontology folder; +# we therefore map the whole repo (../..) to a docker volume. +# +# To use singularity instead of docker, please issue +# export USE_SINGULARITY=<any-value> +# before running this script. +# +# See README-editors.md for more details. + +if [ -f run.sh.conf ]; then + . ./run.sh.conf +fi + +# Look for a GitHub token +if [ -n "$GH_TOKEN" ]; then + : +elif [ -f ../../.github/token.txt ]; then + GH_TOKEN=$(cat ../../.github/token.txt) +elif [ -f $XDG_CONFIG_HOME/ontology-development-kit/github/token ]; then + GH_TOKEN=$(cat $XDG_CONFIG_HOME/ontology-development-kit/github/token) +elif [ -f "$HOME/Library/Application Support/ontology-development-kit/github/token" ]; then + GH_TOKEN=$(cat "$HOME/Library/Application Support/ontology-development-kit/github/token") +fi + +ODK_IMAGE=${ODK_IMAGE:-odkfull} +TAG_IN_IMAGE=$(echo $ODK_IMAGE | awk -F':' '{ print $2 }') +if [ -n "$TAG_IN_IMAGE" ]; then + # Override ODK_TAG env var if IMAGE already includes a tag + ODK_TAG=$TAG_IN_IMAGE + ODK_IMAGE=$(echo $ODK_IMAGE | awk -F':' '{ print $1 }') +fi +ODK_TAG=${ODK_TAG:-v1.4.3} +ODK_JAVA_OPTS=${ODK_JAVA_OPTS:--Xmx20G} +ODK_DEBUG=${ODK_DEBUG:-no} + +# Convert OWLAPI_* environment variables to the OWLAPI as Java options +# See http://owlcs.github.io/owlapi/apidocs_4/org/semanticweb/owlapi/model/parameters/ConfigurationOptions.html +# for a list of allowed options +OWLAPI_OPTIONS_NAMESPACE=org.semanticweb.owlapi.model.parameters.ConfigurationOptions +for owlapi_var in $(env | sed -n s/^OWLAPI_//p) ; do + ODK_JAVA_OPTS="$ODK_JAVA_OPTS -D$OWLAPI_OPTIONS_NAMESPACE.${owlapi_var%=*}=${owlapi_var#*=}" +done + +TIMECMD= +if [ x$ODK_DEBUG = xyes ]; then + # If you wish to change the format string, take care of using + # non-breaking spaces (U+00A0) instead of normal spaces, to + # prevent the shell from tokenizing the format string. 
+ echo "Running ${IMAGE} with ${ODK_JAVA_OPTS} of memory for ROBOT and Java-based pipeline steps." + TIMECMD="/usr/bin/time -f ### DEBUG STATS ###\nElapsed time: %E\nPeak memory: %M kb" +fi + +VOLUME_BIND=$PWD:/work +WORK_DIR=/work + +if [ -n "$ODK_BINDS" ]; then + VOLUME_BIND="$VOLUME_BIND,$ODK_BINDS" +fi + +if [ -n "$USE_SINGULARITY" ]; then + + singularity exec --cleanenv $ODK_SINGULARITY_OPTIONS \ + --env "ROBOT_JAVA_ARGS=$ODK_JAVA_OPTS,JAVA_OPTS=$ODK_JAVA_OPTS" \ + --bind $VOLUME_BIND \ + -W $WORK_DIR \ + docker://obolibrary/$ODK_IMAGE:$ODK_TAG $TIMECMD "$@" +else + BIND_OPTIONS="-v $(echo $VOLUME_BIND | sed 's/,/ -v /')" + docker run $ODK_DOCKER_OPTIONS $BIND_OPTIONS -w $WORK_DIR \ + -e ROBOT_JAVA_ARGS="$ODK_JAVA_OPTS" -e JAVA_OPTS="$ODK_JAVA_OPTS" \ + --rm -ti obolibrary/$ODK_IMAGE:$ODK_TAG $TIMECMD "$@" +fi + +case "$@" in +*update_repo*|*release*) + echo "Please remember to update your ODK image from time to time: https://oboacademy.github.io/obook/howto/odk-update/." + ;; +esac \ No newline at end of file From cb615e8f94bb40b2a7cf274eafde0904e9845244 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Tue, 11 Jun 2024 18:21:35 -0400 Subject: [PATCH 2/3] HGNC ROBOT template - Update: Refactor method to do this from ShellScript / awk to Python / pandas. - Update: Now sorts columns General - Update: .gitignore: Simplified ignores for files at root. 
- Add: Utility function to handle < > around URIs --- .github/workflows/buid_and_release.yml | 2 +- .gitignore | 14 ++-- makefile | 21 +++--- omim2obo/mondo_omim_genes_robot_tsv.py | 64 +++++++++++++++++++ omim2obo/utils/utils.py | 16 +++++ ...o_genes.sparql => mondo-omim-genes.sparql} | 0 6 files changed, 94 insertions(+), 23 deletions(-) create mode 100644 omim2obo/mondo_omim_genes_robot_tsv.py create mode 100644 omim2obo/utils/utils.py rename sparql/{mondo_genes.sparql => mondo-omim-genes.sparql} (100%) diff --git a/.github/workflows/buid_and_release.yml b/.github/workflows/buid_and_release.yml index c2a4638..fd57f5f 100644 --- a/.github/workflows/buid_and_release.yml +++ b/.github/workflows/buid_and_release.yml @@ -40,4 +40,4 @@ jobs: files: | omim.owl omim.sssom.tsv - mondo_genes.robot.tsv + mondo-omim-genes.robot.tsv diff --git a/.gitignore b/.gitignore index ae2ebb0..7475617 100644 --- a/.gitignore +++ b/.gitignore @@ -26,12 +26,8 @@ allelicVariants.txt allelicVariants.tsv # Outputs -omim.ttl -omim.sssom.tsv -omim.sssom.log.txt -omim.json -*.sssom.owl -mondo_exactmatch_omim.sssom.tsv -mondo_exactmatch_omimps.sssom.tsv -omim.owl -mondo_genes.robot.tsv +/*.json +/*.owl +/*.tsv +/*.ttl +/*.txt diff --git a/makefile b/makefile index eb97c04..04c3f83 100644 --- a/makefile +++ b/makefile @@ -2,7 +2,7 @@ # MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------ -all: omim.ttl omim.sssom.tsv omim.owl mondo_genes.robot.tsv +all: omim.ttl omim.sssom.tsv omim.owl mondo-omim-genes.robot.tsv # build: Create new omim.ttl omim.ttl: @@ -35,18 +35,13 @@ omim.owl: omim.ttl mondo_exactmatch_omim.sssom.owl mondo_exactmatch_omimps.sssom query --update sparql/hgnc_links.ru \ convert -f ofn -o $@ -mondo_genes.robot.tsv: omim.owl - # Create a TSV of relational information for gene and disease classes - robot query -i omim.owl --query sparql/mondo_genes.sparql $@ - # Insert the source_code column as the second to last 
column - awk 'BEGIN {FS=OFS="\t"} {if (NR==1) {$$(NF+1)=$$(NF); $$(NF-1)="?source_code";} else {$$(NF+1)=$$(NF); $$(NF-1)="MONDO:OMIM";}} 1' $@ > temp_file && mv temp_file $@ - # Remove the first character, a question mark (?), from each field in the header. This is an artefact of the SPARQL query. - awk 'BEGIN {FS=OFS="\t"} NR==1 {for (i=1; i<=NF; i++) $$i=substr($$i, 2)} {print}' $@ > temp_file && mv temp_file $@ - # Remove < and > characters from specified columns - awk 'BEGIN {FS=OFS="\t"} NR>1 {gsub(/^<|>$$/, "", $$1); gsub(/^<|>$$/, "", $$2); gsub(/^<|>$$/, "", $$5)} {print}' $@ > temp_file && mv temp_file $@ - # Insert ROBOT subheader - robot_subheader="ID\tSC 'has material basis in germline mutation in' some %\t>A oboInOwl:source\t>A oboInOwl:source\t" && \ - sed 1a"$$robot_subheader" $@ > temp_file && mv temp_file $@ +# Create a TSV of relational information for gene and disease classes +mondo-omim-genes.tsv: omim.owl + robot query -i omim.owl --query sparql/mondo-omim-genes.sparql $@ + +# Create a TSV of relational information for gene and disease classes, as a ROBOT template +mondo-omim-genes.robot.tsv: mondo-omim-genes.tsv + python -m omim2obo.mondo_omim_genes_robot_tsv --inpath $< --outpath $@ cleanup: @rm -f omim.json diff --git a/omim2obo/mondo_omim_genes_robot_tsv.py b/omim2obo/mondo_omim_genes_robot_tsv.py new file mode 100644 index 0000000..3930a10 --- /dev/null +++ b/omim2obo/mondo_omim_genes_robot_tsv.py @@ -0,0 +1,64 @@ +"""Create: ROBOT template of Mondo and OMIM gene relations: relational information for gene and disease classes""" +from argparse import ArgumentParser +from pathlib import Path +from typing import Dict, Union + +import pandas as pd + +from omim2obo.utils.utils import remove_angle_brackets + + +ROBOT_SUBHEADER = { + 'mondo_id': 'ID', + 'hgnc_id': "SC 'has material basis in germline mutation in' some %", + 'omim_disease_xref': '>A oboInOwl:source', + 'source_code': '>A oboInOwl:source', + 'omim_gene': '', +} + + +def 
mondo_omim_genes_robot_tsv(inpath: Union[Path, str], outpath: Union[Path, str]) -> pd.DataFrame: + """Create: ROBOT template of Mondo and OMIM gene relations""" + df = pd.read_csv(inpath, sep='\t') + + # Remove the first character, a question mark (?), from each field in the header; an artefact of the SPARQL query. + df.rename(columns={col: col[1:] for col in df.columns if col.startswith('?')}, inplace=True) + + # Add source_code column + df['source_code'] = 'MONDO:OMIM' + + # Remove < and > characters from specified columns + uri_cols = ['mondo_id', 'hgnc_id', 'omim_gene'] + for col in uri_cols: + df[col] = remove_angle_brackets(list(df[col])) + + # Insert ROBOT subheader + df = pd.concat([pd.DataFrame([ROBOT_SUBHEADER]), df]) + + # Format col order + df = df[['mondo_id', 'hgnc_id', 'omim_disease_xref', 'source_code', 'omim_gene']] + + # Sort + df = df.sort_values(by=['mondo_id', 'hgnc_id', 'omim_gene', 'omim_disease_xref']) + + df.to_csv(outpath, sep='\t', index=False) + return pd.DataFrame() + + +def cli(): + """Command line interface.""" + parser = ArgumentParser( + prog='mondo-genes-robot-tsv', + description='Create a ROBOT template TSV of relational information for gene and disease classes') + parser.add_argument( + '-i', '--inpath', required=True, + help='Path to file with such relational information, but not yet formatted as a ROBOT template.') + parser.add_argument( + '-o', '--outpath', required=True, + help='Path to save output.') + d: Dict = vars(parser.parse_args()) + mondo_omim_genes_robot_tsv(**d) + + +if __name__ == '__main__': + cli() diff --git a/omim2obo/utils/utils.py b/omim2obo/utils/utils.py new file mode 100644 index 0000000..abd119f --- /dev/null +++ b/omim2obo/utils/utils.py @@ -0,0 +1,16 @@ +"""Misc utilities""" +from typing import List, Union + + +# todo: also in mondo-ingest. 
Refactor into mondolib: https://github.com/monarch-initiative/mondolib/issues/13 +def remove_angle_brackets(uris: Union[str, List[str]]) -> Union[str, List[str]]: + """Remove angle brackets from URIs, e.g.: + <https://omim.org/entry/100050> --> https://omim.org/entry/100050""" + str_input = isinstance(uris, str) + uris = [uris] if str_input else uris + uris2 = [] + for x in uris: + x = x[1:] if x.startswith('<') else x + x = x[:-1] if x.endswith('>') else x + uris2.append(x) + return uris2[0] if str_input else uris2 diff --git a/sparql/mondo_genes.sparql b/sparql/mondo-omim-genes.sparql similarity index 100% rename from sparql/mondo_genes.sparql rename to sparql/mondo-omim-genes.sparql From 6b3a48de8798ea6240ca5a0a545c4eb0654d6f82 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Tue, 18 Jun 2024 18:06:40 -0400 Subject: [PATCH 3/3] HGNC ROBOT template - Delete: source_code column (w/ values: MONDO:OMIM) - Bug fix: No longer adding exact match gene annotations if >1 gene associated with MIM. --- omim2obo/mondo_omim_genes_robot_tsv.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/omim2obo/mondo_omim_genes_robot_tsv.py b/omim2obo/mondo_omim_genes_robot_tsv.py index 3930a10..9f1af93 100644 --- a/omim2obo/mondo_omim_genes_robot_tsv.py +++ b/omim2obo/mondo_omim_genes_robot_tsv.py @@ -12,7 +12,6 @@ 'mondo_id': 'ID', 'hgnc_id': "SC 'has material basis in germline mutation in' some %", 'omim_disease_xref': '>A oboInOwl:source', - 'source_code': '>A oboInOwl:source', 'omim_gene': '', } @@ -24,23 +23,24 @@ def mondo_omim_genes_robot_tsv(inpath: Union[Path, str], outpath: Union[Path, st # Remove the first character, a question mark (?), from each field in the header; an artefact of the SPARQL query. 
df.rename(columns={col: col[1:] for col in df.columns if col.startswith('?')}, inplace=True) - # Add source_code column - df['source_code'] = 'MONDO:OMIM' - # Remove < and > characters from specified columns uri_cols = ['mondo_id', 'hgnc_id', 'omim_gene'] for col in uri_cols: df[col] = remove_angle_brackets(list(df[col])) - # Insert ROBOT subheader - df = pd.concat([pd.DataFrame([ROBOT_SUBHEADER]), df]) - # Format col order - df = df[['mondo_id', 'hgnc_id', 'omim_disease_xref', 'source_code', 'omim_gene']] + df = df[['mondo_id', 'hgnc_id', 'omim_disease_xref', 'omim_gene']] # Sort df = df.sort_values(by=['mondo_id', 'hgnc_id', 'omim_gene', 'omim_disease_xref']) + # Remove cases where >1 gene association + # - These indicate non-causal relationships, which we don't care about. + df = df[~df['omim_disease_xref'].duplicated(keep=False)] + + # Insert ROBOT subheader + df = pd.concat([pd.DataFrame([ROBOT_SUBHEADER]), df]) + df.to_csv(outpath, sep='\t', index=False) return pd.DataFrame()