@api_blueprint.post(
    "/add_ontology_subtree",
    response_model=None,
    tags=["relations"],
)
def add_ontology_subtree(
    request: Request,
    curie: str = Query(..., example="ncbitaxon:9871"),
    add_subtree: bool = False
):
    """Add the ontology entry identified by ``curie`` to the DKG as a node.

    The curie is lowercased before it is handed to
    ``extract_ontology_subtree``. When ``add_subtree`` is true, that helper is
    asked for every entry in the term's subtree together with the relations
    between them; everything it returns is then written through the DKG
    client stored on the application state.
    """
    normalized_curie = curie.lower()
    node_records, edge_records = extract_ontology_subtree(
        normalized_curie, add_subtree
    )

    # Build (and thereby validate) every model before the first write so that
    # a malformed record fails the request before anything is added.
    new_entities = []
    for node_record in node_records:
        new_entities.append(Entity(**node_record))
    new_relations = []
    for edge_record in edge_records:
        new_relations.append(Relation(**edge_record))

    # Nodes are written first, then the relations that reference them.
    for new_entity in new_entities:
        request.app.state.client.add_node(new_entity)
    for new_relation in new_relations:
        request.app.state.client.add_relation(new_relation)
def _load_relabeled_obo_graph(resource_prefix: str):
    """Return the OBO graph of a resource with lowercased node keys.

    The graph is read from a pickle stored in the resource's versioned pyobo
    directory; when that pickle does not exist yet, the OBO file is ensured
    on disk, parsed, relabeled and pickled for later calls.
    """
    version = get_version(resource_prefix)
    cache_path = prefix_directory_join(
        resource_prefix, name="relabeled_obo_graph.pkl", version=version
    )
    if cache_path.exists():
        # NOTE(security): unpickling executes arbitrary code. This is only
        # acceptable because the file is a cache written by this function;
        # never point this path at data from an untrusted source.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    _, obo_path = _ensure_ontology_path(
        resource_prefix, force=False, version=version
    )
    obo_graph = read_obo(obo_path)
    # Node keys are lowercased so they can be looked up by lowercase curies.
    relabeled_graph = networkx.relabel_nodes(
        obo_graph, lambda node_index: node_index.lower()
    )
    with open(cache_path, "wb") as cache_file:
        pickle.dump(relabeled_graph, cache_file)
    return relabeled_graph


def _obo_node_to_dict(node_curie: str, node: dict, node_type: str) -> dict:
    """Convert the attribute dict of one OBO graph node into DKG node info."""
    property_dict = defaultdict(list)
    for text in node.get("property_value", []):
        # Each entry is "<key> <value>"; partition tolerates a missing value
        # instead of raising on unpacking.
        key, _, value = text.partition(" ")
        property_dict[key].append(value)
    return {
        "id": node_curie,
        "name": node["name"],
        "type": node_type,
        "description": "",
        "obsolete": False,
        "synonyms": [
            # The synonym text is the first double-quoted segment; entries
            # without a quoted segment are skipped rather than failing.
            Synonym(value=syn.split("\"")[1], type="")
            for syn in node.get("synonym", [])
            if syn.count("\"") >= 2
        ],
        "alts": [],
        # Terms without any xref are valid, hence the default.
        "xrefs": [
            Xref(id=xref_curie.lower(), type="")
            for xref_curie in node.get("xref", [])
        ],
        "properties": property_dict,
    }


def extract_ontology_subtree(curie: str, add_subtree: bool = False):
    """Takes in a curie and extracts the information from the
    entry in its respective resource ontology to add as a node into the
    Epidemiology DKG.

    There is an option to extract all the information from the entries
    under the corresponding entry's subtree in its respective ontology.
    Relation information is also extracted with this option.

    Only curies with the ``ncbitaxon`` prefix are handled; any other prefix
    yields two empty lists.

    Running this method for the first time for each specific resource will
    take a long time (minutes) as the obo resource file has to be downloaded,
    converted to a networkx graph, have their node indices normalized, and
    pickled.

    Subsequent runs of this method will take a few seconds as the pickled
    graph object has to be loaded.

    Parameters
    ----------
    curie :
        The curie for the entry that will be added as a node to the
        Epidemiology DKG. It is looked up as given, so it must already be
        lowercase to match the relabeled graph.
    add_subtree :
        Whether to add all the nodes and relations under the entry's subtree

    Returns
    -------
    nodes : List[dict]
        A list of node information added to the DKG, where each node is
        represented as a dictionary. Empty when the prefix is unsupported or
        the curie is not present in the ontology graph.
    edges : List[dict]
        A list of edge information added to the DKG, where each edge is
        represented as a dictionary. Always empty when ``add_subtree`` is
        false.
    """
    nodes, edges = [], []
    resource_prefix = curie.split(":")[0]
    if resource_prefix != "ncbitaxon":
        return nodes, edges

    node_type = "class"
    relabeled_graph = _load_relabeled_obo_graph(resource_prefix)

    node = relabeled_graph.nodes.get(curie)
    if not node:
        return nodes, edges

    if not add_subtree:
        nodes.append(_obo_node_to_dict(curie, node, node_type))
        return nodes, edges

    # NOTE(review): obonet is understood to direct edges from a child term to
    # its parent, which is why ``networkx.ancestors`` is used to collect the
    # terms beneath ``curie`` - confirm against the obonet documentation.
    for node_curie in networkx.ancestors(relabeled_graph, curie) | {curie}:
        node = relabeled_graph.nodes[node_curie]
        nodes.append(_obo_node_to_dict(node_curie, node, node_type))

        # Don't add relations where the original curie to add is the source
        # of an is_a relation. Root nodes won't have an is_a relation, and
        # neither will any other parentless term, so those are skipped too.
        parents = node.get("is_a")
        if node_curie == curie or node["name"] == "root" or not parents:
            continue
        edges.append(
            {
                "source_curie": node_curie,
                "target_curie": parents[0].lower(),
                "type": is_a.name.replace(" ", "_"),
                "pred": is_a.curie,
                "source": resource_prefix,
                "graph": resource_prefix,
                "version": "",
            }
        )
    return nodes, edges
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4f772600-8e3a-43bc-a3eb-47314e5cf6b7", + "metadata": {}, + "outputs": [], + "source": [ + "# The ncbitaxonomy term with identifier 9874 corresponds to \"Odocoileus Virginianus\" which represents white-tailed deer. \n", + "curie = 'ncbitaxon:9874'\n", + "add_subtree = False\n", + "params = {\"curie\": curie, \"add_subtree\": False}" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "18c57e07-eb4a-4993-85c4-e63509a38987", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response = requests.post(\"http://mira-epi-dkg-lb-c7b58edea41524e6.elb.us-east-1.amazonaws.com:8771/api/add_ontology_subtree\", params=params)\n", + "response.status_code" + ] + }, + { + "cell_type": "markdown", + "id": "632f5946-c689-4b95-94a8-232bdf72cfbb", + "metadata": {}, + "source": [ + "## Search for the added node in the DKG\n", + "\n", + "We then use the ```search``` endpoint to look for the newly added node that represents white-tailed deer. Looking at the ontological structure of the white-tailed deer term: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Tree&id=9874&lvl=3&lin=f&keep=1&srchmode=1&unlock, we see that there exists a child term in the subtree for white-tailed deer with curie ```ncbitaxon:9875``` that corresponds to Virginia white-tailed deer. We also search for Virginia white-tailed deer to showcase that no subtree terms have been added." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "247695f2-64e6-4de3-95e7-9c81c3f9a052", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "white-tailed deer\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'id': 'ncbitaxon:9874',\n", + " 'name': 'Odocoileus virginianus',\n", + " 'type': 'class',\n", + " 'obsolete': False,\n", + " 'synonyms': [{'value': 'white-tailed deer', 'type': ''}],\n", + " 'xrefs': [{'id': 'gc_id:1', 'type': ''}],\n", + " 'labels': [],\n", + " 'properties': {'has_rank': ['NCBITaxon:species']}}]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "search_params = {\n", + " \"q\": \"white-tailed deer\",\n", + " \"limit\": 1,\n", + " \"offset\": 0,\n", + " \"prefixes\": \"ncbitaxon\",\n", + " \"wikidata_fallback\": False\n", + "}\n", + "response = requests.get(\"http://mira-epi-dkg-lb-c7b58edea41524e6.elb.us-east-1.amazonaws.com:8771/api/search\", params=search_params)\n", + "print(response.json()[0][\"synonyms\"][0][\"value\"])\n", + "print()\n", + "response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "76ad40ec-188b-4c2f-a5ee-c9f816cb67b3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], + "source": [ + "search_params = {\n", + " \"q\": \"Virginia white-tailed deer\",\n", + " \"limit\": 1,\n", + " \"offset\": 0,\n", + " \"prefixes\": \"ncbitaxon\",\n", + " \"wikidata_fallback\": False\n", + "}\n", + "response = requests.get(\"http://mira-epi-dkg-lb-c7b58edea41524e6.elb.us-east-1.amazonaws.com:8771/api/search\", params=search_params)\n", + "print(response.json())" + ] + }, + { + "cell_type": "markdown", + "id": "8aede484-2e9d-4955-ac5d-ab6149b14edc", + "metadata": {}, + "source": [ + "## Add the ontology subtree full of terms under the specified term\n", + "\n", + "We use the same exact curie representing 
white-tailed deer and endpoint with the only difference being that we set the ```add_subtree``` flag to True." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f816fed8-a07b-410f-b1d5-1dc6976f64e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "params = {\"curie\": curie, \"add_subtree\": True}\n", + "response = requests.post(\"http://mira-epi-dkg-lb-c7b58edea41524e6.elb.us-east-1.amazonaws.com:8771/api/add_ontology_subtree\", params=params)\n", + "response.status_code" + ] + }, + { + "cell_type": "markdown", + "id": "2c258d50-9c5e-487e-bacd-053ca65f033a", + "metadata": {}, + "source": [ + "## Relationship info has been added\n", + "\n", + "Using the ```search``` endpoint for Virginia white-tailed deer now returns results" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ca6a8c94-2814-419b-8cf0-0e938b9b3e26", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Virginia white-tailed deer\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'id': 'ncbitaxon:9875',\n", + " 'name': 'Odocoileus virginianus virginianus',\n", + " 'type': 'class',\n", + " 'obsolete': False,\n", + " 'synonyms': [{'value': 'Virginia white-tailed deer', 'type': ''}],\n", + " 'xrefs': [{'id': 'gc_id:1', 'type': ''}],\n", + " 'labels': [],\n", + " 'properties': {'has_rank': ['NCBITaxon:subspecies']}}]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "search_params = {\n", + " \"q\": \"Virginia white-tailed deer\",\n", + " \"limit\": 1,\n", + " \"offset\": 0,\n", + " \"prefixes\": \"ncbitaxon\",\n", + " \"wikidata_fallback\": False\n", + "}\n", + "response = requests.get(\"http://mira-epi-dkg-lb-c7b58edea41524e6.elb.us-east-1.amazonaws.com:8771/api/search\", params=search_params)\n", + 
"print(response.json()[0][\"synonyms\"][0][\"value\"])\n", + "print()\n", + "response.json()" + ] + }, + { + "cell_type": "markdown", + "id": "a0566fb6-e1b4-4b00-8b1b-b682b8ad60e7", + "metadata": {}, + "source": [ + "## Show the relation between white-tailed deer and Virginia white-tailed deer\n", + "\n", + "We then use the ```relations``` endpoint to query for the ```subclass_of``` relation between source node ```ncbitaxon:9875``` and target node ```ncbitaxon:9874```.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "bb1c69f4-c028-4e2f-9d87-f5da8dfa6803", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'subject': 'ncbitaxon:9875',\n", + " 'predicate': 'rdfs:subClassOf',\n", + " 'object': 'ncbitaxon:9874'}]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "relations_json = {\n", + " \"source_curie\": \"ncbitaxon:9875\",\n", + " \"target_curie\": \"ncbitaxon:9874\",\n", + " \"relations\": \"subclass_of\"\n", + "}\n", + "\n", + "response = requests.post(\"http://mira-epi-dkg-lb-c7b58edea41524e6.elb.us-east-1.amazonaws.com:8771/api/relations\", json=relations_json)\n", + "response.json()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}