Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addition of ontology terms and subtree terms #363

Merged
merged 11 commits into from
Sep 17, 2024
24 changes: 23 additions & 1 deletion mira/dkg/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from mira.dkg.client import AskemEntity, Entity, Relation
from mira.dkg.utils import DKG_REFINER_RELS
from mira.dkg.construct import add_resource_to_dkg
from mira.dkg.construct import add_resource_to_dkg, extract_ontology_subtree

__all__ = [
"api_blueprint",
Expand Down Expand Up @@ -360,6 +360,28 @@ def add_relations(
for relation in relation_list:
request.app.state.client.add_relation(relation)

@api_blueprint.post(
"/add_ontology_subtree",
response_model=None,
tags=["relations"],
)
def add_ontology_subtree(
request: Request,
curie: str = Query(..., example="ncbitaxon:9871"),
add_subtree: bool = False
):
"""Given a curie, add the entry it corresponds to from its respective
ontology as a node to the DKG.
Can enable the `add_subtree` flag to add all subtree entries."""
curie = curie.lower()
nodes, edges = extract_ontology_subtree(curie, add_subtree)
entities = [Entity(**node_info) for node_info in nodes]
relations = [Relation(**edge_info) for edge_info in edges]
for entity in entities:
request.app.state.client.add_node(entity)
for relation in relations:
request.app.state.client.add_relation(relation)


@api_blueprint.post(
"/add_resources",
Expand Down
135 changes: 132 additions & 3 deletions mira/dkg/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,16 @@
import click
import pyobo
import pystow
import networkx
from bioontologies import obograph
from bioontologies.obograph import Xref
from bioregistry import manager
from pydantic import BaseModel, Field
from pyobo.struct import part_of
from pyobo.struct import part_of, is_a
from pyobo.sources import ontology_resolver
from pyobo.getters import _ensure_ontology_path
from pyobo.api.utils import get_version
from pyobo.utils.path import prefix_directory_join
from obonet import read_obo
from tabulate import tabulate
from tqdm.auto import tqdm
from typing_extensions import Literal
Expand All @@ -54,7 +58,7 @@
from mira.dkg.physical_constants import get_physical_constant_terms
from mira.dkg.constants import EDGE_HEADER, NODE_HEADER
from mira.dkg.utils import PREFIXES
from mira.dkg.client import Synonym, Xref
from mira.dkg.models import Synonym, Xref
from mira.dkg.resources.cso import get_cso_obo
from mira.dkg.resources.geonames import get_geonames_terms
from mira.dkg.resources.extract_eiffel_ontology import get_eiffel_ontology_terms
Expand Down Expand Up @@ -421,6 +425,131 @@ def add_resource_to_dkg(resource_prefix: str):
# handle resource names that we don't process
return [], []

def extract_ontology_subtree(curie: str, add_subtree: bool = False):
"""Takes in a curie and extracts the information from the
entry in its respective resource ontology to add as a node into the
Epidemiology DKG.

There is an option to extract all the information from the entries
under the corresponding entry's subtree in its respective ontology.
Relation information is also extracted with this option.

Running this method for the first time for each specific resource will
take a long time (minutes) as the obo resource file has to be downloaded,
converted to a networkx graph, have their node indices normalized, and
pickled.

Subsequent runs of this method will take a few seconds as the pickled
graph object has to be loaded.

Parameters
----------
curie :
The curie for the entry that will be added as a node to the
Epidemiology DKG.
add_subtree :
Whether to add all the nodes and relations under the entry's subtree

Returns
-------
nodes : List[dict]
A list of node information added to the DKG, where each node is
represented as a dictionary.
edges : List[dict]
A list of edge information added to the DKG, where each edge is
represented as a dictionary.
"""
nodes, edges = [], []
resource_prefix = curie.split(":")[0]
if resource_prefix == "ncbitaxon":
type = "class"
version = get_version(resource_prefix)
cached_relabeled_obo_graph_path = prefix_directory_join(resource_prefix,
name="relabeled_obo_graph.pkl",
version=version)
if not cached_relabeled_obo_graph_path.exists():
_, obo_path = _ensure_ontology_path(resource_prefix, force=False,
version=version)
obo_graph = read_obo(obo_path)
relabeled_graph = networkx.relabel_nodes(obo_graph,
lambda node_index: node_index.lower())
with open(cached_relabeled_obo_graph_path,'wb') as relabeled_graph_file:
pickle.dump(relabeled_graph, relabeled_graph_file)
else:
with open(cached_relabeled_obo_graph_path,'rb') as relabeled_graph_file:
relabeled_graph = pickle.load(relabeled_graph_file)
else:
return nodes, edges

node = relabeled_graph.nodes.get(curie)
if not node:
return nodes, edges
if not add_subtree:
property_dict = defaultdict(list)
for text in node.get("property_value", []):
k, v = text.split(" ", 1)
property_dict[k].append(v)
nodes.append(
{
"id": curie,
"name": node["name"],
"type": type,
"description": "",
"obsolete": False,
"synonyms": [
Synonym(value=syn.split("\"")[1],
type="") for syn in
node.get("synonym", [])
],
"alts": [],
"xrefs": [Xref(id=xref_curie.lower(), type="")
for xref_curie in node["xref"]],
"properties": property_dict
}
)
return nodes, edges
else:
for node_curie in networkx.ancestors(relabeled_graph, curie) | {curie}:
node_curie = node_curie
node = relabeled_graph.nodes[node_curie]
property_dict = defaultdict(list)
for text in node.get("property_value", []):
k, v = text.split(" ",1)
property_dict[k].append(v)
nodes.append(
{
"id": node_curie,
"name": node["name"],
"type": type,
"description": "",
"obsolete": False,
"synonyms": [
Synonym(value=syn.split("\"")[1],
type="") for syn in
node.get("synonym", [])
],
"alts": [],
"xrefs": [Xref(id=xref_curie.lower(), type="")
for xref_curie in node.get("xref", [])],
"properties": property_dict
}
)
# Don't add relations where the original curie to add is the source
# of an is_a relation. Root nodes won't have an is_a relation.
if node_curie == curie or node["name"] == "root":
continue
edges.append(
{
"source_curie": node_curie,
"target_curie": node["is_a"][0].lower(),
"type": is_a.name.replace(" ","_"),
"pred": is_a.curie,
"source": resource_prefix,
"graph": resource_prefix,
"version": ""
}
)
return nodes, edges

@click.command()
@click.option(
Expand Down
Loading
Loading