diff --git a/.github/workflows/buid_and_release.yml b/.github/workflows/buid_and_release.yml index 747c4d1..fd57f5f 100644 --- a/.github/workflows/buid_and_release.yml +++ b/.github/workflows/buid_and_release.yml @@ -40,4 +40,4 @@ jobs: files: | omim.owl omim.sssom.tsv - mondo_genes.csv + mondo-omim-genes.robot.tsv diff --git a/.gitignore b/.gitignore index 00a176d..7475617 100644 --- a/.gitignore +++ b/.gitignore @@ -26,12 +26,8 @@ allelicVariants.txt allelicVariants.tsv # Outputs -omim.ttl -omim.sssom.tsv -omim.sssom.log.txt -omim.json -*.sssom.owl -mondo_exactmatch_omim.sssom.tsv -mondo_exactmatch_omimps.sssom.tsv -omim.owl -mondo_genes.csv +/*.json +/*.owl +/*.tsv +/*.ttl +/*.txt diff --git a/README.md b/README.md index 48f105d..05bbaae 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ OMIM stands for "Online Mendelian Inheritance in Man", and is an online catalog of human genes and genetic disorders. The official site is: https://omim.org/ This purpose of this repository is for data transformations for ingest into Mondo. Mainly, -it is for generating an `omim.ttl` file. +it is for generating an `omim.ttl` file and other release artefacts. Disclaimer: This repository and its created data artefacts are unnofficial. For official, up-to-date OMIM data, please visit [omim.org](https://omim.org). @@ -31,10 +31,10 @@ you get an error related to this when installing, ignore it, as it is does not seem to be needed to run any of the tools. If however you do get a `psutil` error when running anything, please let us know by [creating an issue](https://github.com/monarch-initiative/omim/issues/new). -## Running & creating `omim.ttl` -Run: `make all` +## Running & creating a release +Run: `sh run.sh make all` -Running this will create a new `omim.ttl` file in the root directory. +Running this will create new release artefacts in the root directory. You can also run `make build` or `python -m omim2obo`. These are all the same command. 
This will download files from omim.org and run the build. @@ -44,8 +44,11 @@ If there's an issue downloading the files, or you are offline, or you just want to use the cache anyway, you can pass the `--use-cache` flag. ## Additional tools +
Details +

+ ### Get PMIDs used for OMIM codes from `omim.ttl` -Command: `make get-pmids` +Command: `sh run.sh make get-pmids` ### OMIM Code Web Scraper Currently, the only feature is `get_codes_by_yyyy_mm`, which returns a list of @@ -86,3 +89,7 @@ from omim2obo.omim_code_scraper import get_codes_by_yyyy_mm code_tuples = get_codes_by_yyyy_mm('2021/05') ``` + + +

+
diff --git a/makefile b/makefile index 11f3af7..04c3f83 100644 --- a/makefile +++ b/makefile @@ -2,7 +2,7 @@ # MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------ -all: omim.ttl omim.sssom.tsv omim.owl mondo_genes.csv +all: omim.ttl omim.sssom.tsv omim.owl mondo-omim-genes.robot.tsv # build: Create new omim.ttl omim.ttl: @@ -35,8 +35,13 @@ omim.owl: omim.ttl mondo_exactmatch_omim.sssom.owl mondo_exactmatch_omimps.sssom query --update sparql/hgnc_links.ru \ convert -f ofn -o $@ -mondo_genes.csv: omim.owl - robot query -i omim.owl --query sparql/mondo_genes.sparql $@ +# Create a TSV of relational information for gene and disease classes +mondo-omim-genes.tsv: omim.owl + robot query -i omim.owl --query sparql/mondo-omim-genes.sparql $@ + +# Create a TSV of relational information for gene and disease classes, as a ROBOT template +mondo-omim-genes.robot.tsv: mondo-omim-genes.tsv + python -m omim2obo.mondo_omim_genes_robot_tsv --inpath $< --outpath $@ cleanup: @rm -f omim.json
diff --git a/omim2obo/mondo_omim_genes_robot_tsv.py b/omim2obo/mondo_omim_genes_robot_tsv.py
new file mode 100644
index 0000000..9f1af93
--- /dev/null
+++ b/omim2obo/mondo_omim_genes_robot_tsv.py
@@ -0,0 +1,64 @@
+"""Create: ROBOT template of Mondo and OMIM gene relations: relational information for gene and disease classes"""
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import Dict, Union
+
+import pandas as pd
+
+from omim2obo.utils.utils import remove_angle_brackets
+
+
+ROBOT_SUBHEADER = {
+    'mondo_id': 'ID',
+    'hgnc_id': "SC 'has material basis in germline mutation in' some %",
+    'omim_disease_xref': '>A oboInOwl:source',
+    'omim_gene': '',
+}
+
+
+def mondo_omim_genes_robot_tsv(inpath: Union[Path, str], outpath: Union[Path, str]) -> pd.DataFrame:
+    """Convert the query-result TSV at `inpath` into a ROBOT template TSV at `outpath`; return the written table."""
+    df = pd.read_csv(inpath, sep='\t')
+
+    # Remove the first character, a question mark (?), from each field in the header; an artefact of the SPARQL query.
+    df.rename(columns={col: col[1:] for col in df.columns if col.startswith('?')}, inplace=True)
+
+    # Remove < and > characters from specified columns
+    uri_cols = ['mondo_id', 'hgnc_id', 'omim_gene']
+    for col in uri_cols:
+        df[col] = remove_angle_brackets(list(df[col]))
+
+    # Format col order
+    df = df[['mondo_id', 'hgnc_id', 'omim_disease_xref', 'omim_gene']]
+
+    # Sort
+    df = df.sort_values(by=['mondo_id', 'hgnc_id', 'omim_gene', 'omim_disease_xref'])
+
+    # Drop every row whose OMIM disease xref occurs more than once (i.e. >1 gene association).
+    # - These indicate non-causal relationships, which we don't care about.
+    df = df[~df['omim_disease_xref'].duplicated(keep=False)]
+
+    # Insert ROBOT subheader as the first row; reset the index so the returned frame has unique row labels.
+    df = pd.concat([pd.DataFrame([ROBOT_SUBHEADER]), df], ignore_index=True)
+
+    df.to_csv(outpath, sep='\t', index=False)
+    return df
+
+
+def cli():
+    """Command line interface: parse --inpath / --outpath and run the conversion."""
+    parser = ArgumentParser(
+        prog='mondo-genes-robot-tsv',
+        description='Create a ROBOT template TSV of relational information for gene and disease classes')
+    parser.add_argument(
+        '-i', '--inpath', required=True,
+        help='Path to file with such relational information, but not yet formatted as a ROBOT template.')
+    parser.add_argument(
+        '-o', '--outpath', required=True,
+        help='Path to save output.')
+    d: Dict = vars(parser.parse_args())
+    mondo_omim_genes_robot_tsv(**d)
+
+
+if __name__ == '__main__':
+    cli()
diff --git a/omim2obo/utils/utils.py b/omim2obo/utils/utils.py
new file mode 100644
index 0000000..abd119f
--- /dev/null
+++ b/omim2obo/utils/utils.py
@@ -0,0 +1,16 @@
+"""Misc utilities"""
+from typing import List, Union
+
+
+# todo: also in mondo-ingest. 
Refactor into mondolib: https://github.com/monarch-initiative/mondolib/issues/13
+def remove_angle_brackets(uris: Union[str, List[str]]) -> Union[str, List[str]]:
+    """Remove one leading '<' and one trailing '>' from a URI or list of URIs (same shape is returned), e.g.:
+    <https://omim.org/entry/100050> --> https://omim.org/entry/100050"""
+    str_input = isinstance(uris, str)  # remember whether to unwrap the result back to a single str
+    uris = [uris] if str_input else uris
+    uris2 = []
+    for x in uris:
+        x = x[1:] if x.startswith('<') else x
+        x = x[:-1] if x.endswith('>') else x
+        uris2.append(x)
+    return uris2[0] if str_input else uris2
diff --git a/run.sh b/run.sh
new file mode 100644
index 0000000..1f6db72
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,85 @@
+#!/bin/sh
+# Wrapper script for docker.
+#
+# This is used primarily for wrapping the GNU Make workflow.
+# Instead of typing "make TARGET", type "./run.sh make TARGET".
+# This will run the make workflow within a docker container.
+#
+# The current working directory ($PWD) is mounted into the container as /work,
+# so run this script from the directory that contains the makefile.
+#
+# To use singularity instead of docker, please issue
+#   export USE_SINGULARITY=1   (any non-empty value)
+# before running this script.
+#
+# See README-editors.md for more details.
+
+if [ -f run.sh.conf ]; then
+    . 
./run.sh.conf
+fi
+
+# Look for a GitHub token
+if [ -n "$GH_TOKEN" ]; then
+    :
+elif [ -f ../../.github/token.txt ]; then
+    GH_TOKEN=$(cat ../../.github/token.txt)
+elif [ -f "$XDG_CONFIG_HOME/ontology-development-kit/github/token" ]; then
+    GH_TOKEN=$(cat "$XDG_CONFIG_HOME/ontology-development-kit/github/token")
+elif [ -f "$HOME/Library/Application Support/ontology-development-kit/github/token" ]; then
+    GH_TOKEN=$(cat "$HOME/Library/Application Support/ontology-development-kit/github/token")
+fi
+
+ODK_IMAGE=${ODK_IMAGE:-odkfull}
+TAG_IN_IMAGE=$(echo "$ODK_IMAGE" | awk -F':' '{ print $2 }')
+if [ -n "$TAG_IN_IMAGE" ]; then
+    # Override ODK_TAG env var if ODK_IMAGE already includes a tag
+    ODK_TAG=$TAG_IN_IMAGE
+    ODK_IMAGE=$(echo "$ODK_IMAGE" | awk -F':' '{ print $1 }')
+fi
+ODK_TAG=${ODK_TAG:-v1.4.3}
+ODK_JAVA_OPTS=${ODK_JAVA_OPTS:--Xmx20G}
+ODK_DEBUG=${ODK_DEBUG:-no}
+
+# Convert OWLAPI_* environment variables to the OWLAPI as Java options
+# See http://owlcs.github.io/owlapi/apidocs_4/org/semanticweb/owlapi/model/parameters/ConfigurationOptions.html
+# for a list of allowed options
+OWLAPI_OPTIONS_NAMESPACE=org.semanticweb.owlapi.model.parameters.ConfigurationOptions
+for owlapi_var in $(env | sed -n s/^OWLAPI_//p) ; do
+    ODK_JAVA_OPTS="$ODK_JAVA_OPTS -D$OWLAPI_OPTIONS_NAMESPACE.${owlapi_var%=*}=${owlapi_var#*=}"
+done
+
+TIMECMD=
+if [ x$ODK_DEBUG = xyes ]; then
+    # If you wish to change the format string, take care of using
+    # non-breaking spaces (U+00A0) instead of normal spaces, to
+    # prevent the shell from tokenizing the format string.
+    echo "Running obolibrary/${ODK_IMAGE}:${ODK_TAG} with ${ODK_JAVA_OPTS} of memory for ROBOT and Java-based pipeline steps."
+    TIMECMD="/usr/bin/time -f ### DEBUG STATS ###\nElapsed time: %E\nPeak memory: %M kb"
+fi
+
+VOLUME_BIND=$PWD:/work
+WORK_DIR=/work
+
+if [ -n "$ODK_BINDS" ]; then
+    VOLUME_BIND="$VOLUME_BIND,$ODK_BINDS"
+fi
+
+if [ -n "$USE_SINGULARITY" ]; then
+
+    singularity exec --cleanenv $ODK_SINGULARITY_OPTIONS \
+        --env "ROBOT_JAVA_ARGS=$ODK_JAVA_OPTS,JAVA_OPTS=$ODK_JAVA_OPTS" \
+        --bind $VOLUME_BIND \
+        -W $WORK_DIR \
+        docker://obolibrary/$ODK_IMAGE:$ODK_TAG $TIMECMD "$@"
+else
+    BIND_OPTIONS="-v $(echo $VOLUME_BIND | sed 's/,/ -v /g')"
+    docker run $ODK_DOCKER_OPTIONS $BIND_OPTIONS -w $WORK_DIR \
+        -e ROBOT_JAVA_ARGS="$ODK_JAVA_OPTS" -e JAVA_OPTS="$ODK_JAVA_OPTS" \
+        --rm -ti obolibrary/$ODK_IMAGE:$ODK_TAG $TIMECMD "$@"
+fi
+ODK_STATUS=$?  # exit status of the singularity/docker command run by the if-block above
+case "$*" in
+*update_repo*|*release*)
+    echo "Please remember to update your ODK image from time to time: https://oboacademy.github.io/obook/howto/odk-update/." ;;
+esac
+exit $ODK_STATUS  # propagate the container's status; otherwise the case statement's status (0) would be returned
\ No newline at end of file
diff --git a/sparql/mondo_genes.sparql b/sparql/mondo-omim-genes.sparql
similarity index 100%
rename from sparql/mondo_genes.sparql
rename to sparql/mondo-omim-genes.sparql