Skip to content

Commit

Permalink
Merge pull request #95 from indralab/lexical-merge
Browse files Browse the repository at this point in the history
Add Biomappings support for the DKG
  • Loading branch information
cthoyt authored Jul 5, 2023
2 parents 7f849c7 + 8fe2d81 commit 65407db
Show file tree
Hide file tree
Showing 7 changed files with 313 additions and 33 deletions.
94 changes: 65 additions & 29 deletions mira/dkg/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pathlib import Path
from typing import Dict, NamedTuple, Sequence, Union, Optional

import biomappings
import bioontologies
import click
import pyobo
Expand Down Expand Up @@ -248,18 +249,18 @@ def construct(
for edge_node in edge_graph.nodes:
if edge_node.deprecated or edge_node.id.startswith("_:genid"):
continue
if not edge_node.lbl:
if not edge_node.name:
if edge_node.id in LABELS:
edge_node.lbl = LABELS[edge_node.id]
edge_node.name = LABELS[edge_node.id]
elif edge_node.prefix:
edge_node.lbl = edge_node.luid
edge_node.name = edge_node.identifier
else:
click.secho(f"missing label for {edge_node.curie}")
continue
if not edge_node.prefix:
tqdm.write(f"unparsable IRI: {edge_node.id} - {edge_node.lbl}")
tqdm.write(f"unparsable IRI: {edge_node.id} - {edge_node.name}")
continue
edge_names[edge_node.curie] = edge_node.lbl.strip()
edge_names[edge_node.curie] = edge_node.name.strip()
EDGE_NAMES_PATH.write_text(json.dumps(edge_names, sort_keys=True, indent=2))

# A mapping from CURIEs to node information tuples
Expand Down Expand Up @@ -463,7 +464,7 @@ def construct(
property_values="",
xref_types="", # TODO
synonym_types=";".join(
synonym.type or "skos:exactMatch" for synonym in term.synonyms or []
synonym.type.curie for synonym in term.synonyms or []
),
)
for parent in term.parents:
Expand Down Expand Up @@ -518,6 +519,9 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
else:
return curie_

biomappings_xref_graph = biomappings.get_true_graph()
added_biomappings = 0

for prefix in use_case_paths.prefixes:
if prefix in {"geonames", "uat", "probonto"}: # added with custom code
continue
Expand Down Expand Up @@ -560,13 +564,12 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
f"{manager.get_name(prefix)} ({len(_graphs)} graphs)", fg="green", bold=True
)
for graph in tqdm(_graphs, unit="graph", desc=prefix, leave=False):
if not graph.id:
raise ValueError(f"graph in {prefix} missing an ID")
graph_id = graph.id or prefix
version = graph.version
if version == "imports":
version = None
for node in graph.nodes:
if node.deprecated or not node.prefix or not node.luid:
if node.deprecated or not node.reference:
continue
if node.id.startswith("_:gen"): # skip blank nodes
continue
Expand All @@ -581,25 +584,45 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
if curie not in nodes or (curie in nodes and prefix == node.prefix):
# TODO filter out properties that are covered elsewhere
properties = sorted(
(prop.pred_curie, prop.val_curie)
(prop.predicate.curie, prop.value.curie)
for prop in node.properties
if prop.pred_prefix and prop.val_prefix
if prop.predicate and prop.value
)
property_predicates, property_values = [], []
for pred_curie, val_curie in properties:
property_predicates.append(pred_curie)
property_values.append(val_curie)

xref_predicates, xref_references = [], []
for xref in node.xrefs or []:
if xref.predicate and xref.value:
xref_predicates.append(xref.predicate.curie)
xref_references.append(xref.value.curie)

if node.curie in biomappings_xref_graph:
for xref_curie in biomappings_xref_graph.neighbors(node.curie):
if ":" not in xref_curie:
continue
added_biomappings += 1
xref_predicate = biomappings_xref_graph.edges[node.curie, xref_curie][
"relation"
]
if xref_predicate == "speciesSpecific":
xref_predicate = "debio:0000003"
xref_predicates.append(xref_predicate)
xref_references.append(xref_curie)

nodes[curie] = NodeInfo(
curie=node.curie,
prefix=node.prefix,
label=node.lbl.strip('"')
label=node.name.strip('"')
.strip()
.strip('"')
.replace("\n", " ")
.replace(" ", " ")
if node.lbl
if node.name
else "",
synonyms=";".join(synonym.val for synonym in node.synonyms),
synonyms=";".join(synonym.value for synonym in node.synonyms),
deprecated="true" if node.deprecated else "false", # type:ignore
# TODO better way to infer type based on hierarchy
# (e.g., if rdfs:type available, consider as instance)
Expand All @@ -608,16 +631,15 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
.replace('"', "")
.replace("\n", " ")
.replace(" ", " "),
xrefs=";".join(xref.curie for xref in node.xrefs if xref.prefix),
xrefs=";".join(xref_references),
alts=";".join(node.alternative_ids),
version=version or "",
property_predicates=";".join(property_predicates),
property_values=";".join(property_values),
xref_types=";".join(
xref.pred for xref in node.xrefs or [] if xref.prefix
),
xref_types=";".join(xref_predicates),
synonym_types=";".join(
synonym.pred for synonym in node.synonyms
synonym.predicate.curie if synonym.predicate else synonym.predicate_raw
for synonym in node.synonyms
),
)

Expand All @@ -629,7 +651,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
"replaced_by",
"iao:0100001",
prefix,
graph.id,
graph_id,
version or "",
)
)
Expand Down Expand Up @@ -668,7 +690,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
"xref",
"oboinowl:hasDbXref",
prefix,
graph.id,
graph_id,
version or "",
)
)
Expand All @@ -691,12 +713,15 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
synonym_types="",
)

for provenance_curie in node.get_provenance():
for provenance in node.get_provenance():
if ":" in provenance.identifier:
tqdm.write(f"Malformed provenance for {node.curie}")
provenance_curie = provenance.curie
node_sources[provenance_curie].add(prefix)
if provenance_curie not in nodes:
nodes[provenance_curie] = NodeInfo(
curie=provenance_curie,
prefix=provenance_curie.split(":")[0],
prefix=provenance.prefix,
label="",
synonyms="",
deprecated="false",
Expand All @@ -717,7 +742,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
"has_citation",
"debio:0000029",
prefix,
graph.id,
graph_id,
version or "",
)
)
Expand Down Expand Up @@ -751,25 +776,34 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
+ "\n"
)

unstandardized_nodes.extend(node.id for node in graph.nodes if not node.prefix)
unstandardized_nodes.extend(node.id for node in graph.nodes if not node.reference)
unstandardized_edges.extend(
edge.pred for edge in graph.edges if edge.pred.startswith("http")
)

clean_edges = (
edge
for edge in graph.edges
if (
edge.subject is not None
and edge.predicate is not None
and edge.object is not None
and edge.object.curie not in OBSOLETE
)
)
edges.extend(
(
edge.sub,
edge.obj,
_get_edge_name(edge.pred).lower().replace(" ", "_").replace("-", "_"),
edge.pred,
prefix,
graph.id,
graph_id,
version or "",
)
for edge in tqdm(
sorted(graph.edges, key=methodcaller("as_tuple")), unit="edge", unit_scale=True
sorted(clean_edges, key=methodcaller("as_tuple")), unit="edge", unit_scale=True
)
if edge.obj not in OBSOLETE
)

for sub, obj, pred_label, pred, *_ in edges:
Expand All @@ -787,6 +821,8 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
writer.writerows(edges)
tqdm.write(f"output edges to {edges_path}")

tqdm.write(f"incorporated {added_biomappings:,} xrefs from biomappings")

with gzip.open(use_case_paths.NODES_PATH, "wt") as file:
writer = csv.writer(file, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
writer.writerow(NODE_HEADER)
Expand Down Expand Up @@ -943,7 +979,7 @@ def get_node_info(term: pyobo.Term, type: EntityType = "class"):
property_values="",
xref_types="",
synonym_types=";".join(
synonym.type or "skos:exactMatch" for synonym in term.synonyms or []
synonym.type.curie for synonym in term.synonyms or []
),
)

Expand Down
92 changes: 92 additions & 0 deletions mira/dkg/construct_gilda_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Construct a Gilda grounding cache for all terms in the graph."""

import csv
import gzip
from typing import Iterable

import click
import pandas as pd
from gilda.process import normalize
from gilda.term import Term, filter_out_duplicates
from tqdm import tqdm

from mira.dkg.construct import GILDA_TERMS_PATH, NODES_PATH, upload_s3


@click.command()
@click.option(
    "--upload",
    is_flag=True,  # without is_flag, click makes this a value-taking str option,
                   # but _main expects a bool — `--upload` alone would be a CLI error
    help="Upload the Gilda term cache to S3 after construction.",
)
def main(upload):
    """Construct the Gilda grounding cache for all terms in the graph."""
    _main(upload=upload)


def _main(upload: bool):
    """Write the deduplicated Gilda term cache as a gzipped TSV.

    :param upload: If true, push the resulting file to S3 after writing.
    """
    deduplicated = filter_out_duplicates(list(_iter_terms()))
    # Column order matches gilda.term.Term.to_list()
    columns = (
        "norm_text",
        "text",
        "db",
        "id",
        "entry_name",
        "status",
        "source",
        "organism",
        "source_db",
        "source_id",
    )
    with gzip.open(GILDA_TERMS_PATH, "wt", encoding="utf-8") as handle:
        tsv_writer = csv.writer(handle, delimiter="\t")
        tsv_writer.writerow(columns)
        for term in deduplicated:
            tsv_writer.writerow(term.to_list())
    if upload:
        upload_s3(GILDA_TERMS_PATH)


def _iter_terms() -> Iterable[Term]:
    """Yield a Gilda term for each name and each synonym in the DKG node table.

    Nodes with a missing/NaN name are skipped entirely; blank synonym
    entries (after splitting on ``;``) are skipped individually.
    """
    frame = pd.read_csv(NODES_PATH, sep="\t")
    for row in tqdm(frame.values, unit_scale=True, unit="node"):
        # Positional unpacking must match the NODES_PATH column order.
        (
            curie,
            _,
            name,
            synonyms,
            _obsolete,
            _type,
            _description,
            _xrefs,
            _alts,
            _version,
            _prop_preds,
            _prop_values,
            _xref_types,
            _synonym_types,
            _sources,
        ) = row
        if not name or pd.isna(name):
            continue
        prefix, identifier = curie.split(":", 1)
        # Fields shared by the name term and all of its synonym terms.
        shared = dict(db=prefix, id=identifier, entry_name=name, source=prefix)
        yield Term(norm_text=normalize(name), text=name, status="name", **shared)
        if synonyms and not pd.isna(synonyms):
            for synonym in synonyms.split(";"):
                if synonym.strip():
                    yield Term(
                        norm_text=normalize(synonym),
                        text=synonym,
                        status="synonym",
                        **shared,
                    )


# Allow running this module directly as a script; click parses the CLI args.
if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions mira/dkg/construct_rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
# Should be fixed in https://github.com/geneontology/go-ontology/pull/24148
# and after HP re-imports GO
"doi:10.1002/(SICI)1097-4687(199608)229:2<121::AID-JMOR1>3.0.CO;2-4",
# https://github.com/obophenotype/human-phenotype-ontology/pull/9812
"pubmed:14645606|PMID:14647932|PMID:31669363",
}
REMAPPING = {
REFERENCED_BY_SYMBOL: "debio:0000030",
Expand Down
1 change: 0 additions & 1 deletion mira/dkg/construct_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from typing import Optional, Set

import bioregistry
import bioregistry.app.impl
import click
from bioregistry import Manager
from tqdm import tqdm
Expand Down
Loading

0 comments on commit 65407db

Please sign in to comment.