Skip to content

Commit

Permalink
Integrate with Biomappings
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Jun 12, 2023
1 parent 87dd59a commit 58b67dc
Show file tree
Hide file tree
Showing 3 changed files with 261 additions and 4 deletions.
29 changes: 25 additions & 4 deletions mira/dkg/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,9 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
else:
return curie_

biomappings_xref_graph = biomappings.get_true_graph()
added_biomappings = 0

for prefix in use_case_paths.prefixes:
if prefix in {"geonames", "uat", "probonto"}: # added with custom code
continue
Expand Down Expand Up @@ -589,6 +592,26 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
for pred_curie, val_curie in properties:
property_predicates.append(pred_curie)
property_values.append(val_curie)

xref_predicates, xref_references = [], []
for xref in node.xrefs or []:
if xref.prefix:
xref_predicates.append(xref.pred)
xref_references.append(xref.curie)

if node.curie in biomappings_xref_graph:
for xref_curie in biomappings_xref_graph.neighbors(node.curie):
if ":" not in xref_curie:
continue
added_biomappings += 1
xref_predicate = biomappings_xref_graph.edges[node.curie, xref_curie][
"relation"
]
if xref_predicate == "speciesSpecific":
xref_predicate = "debio:0000003"
xref_predicates.append(xref_predicate)
xref_references.append(xref_curie)

nodes[curie] = NodeInfo(
curie=node.curie,
prefix=node.prefix,
Expand All @@ -608,14 +631,12 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
.replace('"', "")
.replace("\n", " ")
.replace(" ", " "),
xrefs=";".join(xref.curie for xref in node.xrefs if xref.prefix),
xrefs=";".join(xref_references),
alts=";".join(node.alternative_ids),
version=version or "",
property_predicates=";".join(property_predicates),
property_values=";".join(property_values),
xref_types=";".join(
xref.pred for xref in node.xrefs or [] if xref.prefix
),
xref_types=";".join(xref_predicates),
synonym_types=";".join(
synonym.pred for synonym in node.synonyms
),
Expand Down
92 changes: 92 additions & 0 deletions mira/dkg/construct_gilda_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Construct a Gilda grounding cache for all terms in the graph."""

import csv
import gzip
from typing import Iterable

import click
import pandas as pd
from gilda.process import normalize
from gilda.term import Term, filter_out_duplicates
from tqdm import tqdm

from mira.dkg.construct import GILDA_TERMS_PATH, NODES_PATH, upload_s3


@click.command()
@click.option(
    "--upload",
    is_flag=True,  # fix: without is_flag, click makes this a string option requiring a value
    help="If given, upload the generated Gilda term cache to S3.",
)
def main(upload: bool):
    """CLI entry point: build the Gilda grounding cache, optionally uploading it."""
    _main(upload=upload)


def _main(upload: bool):
    """Write every Gilda term for the graph's nodes to a gzipped TSV cache.

    Deduplicates the terms, writes them with a standard Gilda header to
    ``GILDA_TERMS_PATH``, and optionally pushes the file to S3.
    """
    columns = (
        "norm_text",
        "text",
        "db",
        "id",
        "entry_name",
        "status",
        "source",
        "organism",
        "source_db",
        "source_id",
    )
    deduplicated = filter_out_duplicates(list(_iter_terms()))
    with gzip.open(GILDA_TERMS_PATH, "wt", encoding="utf-8") as handle:
        tsv_writer = csv.writer(handle, delimiter="\t")
        tsv_writer.writerow(columns)
        tsv_writer.writerows(term.to_list() for term in deduplicated)
    if upload:
        upload_s3(GILDA_TERMS_PATH)


def _iter_terms() -> Iterable[Term]:
    """Yield a Gilda ``Term`` for each named node and each of its synonyms.

    Reads the node table from ``NODES_PATH`` and skips rows without a name.
    Synonyms are stored as a semicolon-delimited string; blank entries are
    ignored.
    """
    frame = pd.read_csv(NODES_PATH, sep="\t")
    for row in tqdm(frame.values, unit_scale=True, unit="node"):
        # Positional unpacking mirrors the node-table column order;
        # underscore-prefixed slots are unused here.
        (
            curie,
            _,
            name,
            synonyms,
            _obsolete,
            _type,
            _description,
            xrefs,
            alts,
            _version,
            _prop_preds,
            _prop_values,
            xref_types,
            synonym_types,
            _sources,
        ) = row
        if not name or pd.isna(name):
            continue
        prefix, identifier = curie.split(":", 1)
        yield Term(
            norm_text=normalize(name),
            text=name,
            db=prefix,
            id=identifier,
            entry_name=name,
            status="name",
            source=prefix,
        )
        if not synonyms or pd.isna(synonyms):
            continue
        for synonym in synonyms.split(";"):
            if not synonym.strip():
                continue
            yield Term(
                norm_text=normalize(synonym),
                text=synonym,
                db=prefix,
                id=identifier,
                entry_name=name,
                status="synonym",
                source=prefix,
            )


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
144 changes: 144 additions & 0 deletions mira/dkg/propose_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""A script to propose equivalent nodes in the DKG that aren't already mapped."""

from collections import defaultdict
from typing import List

import biomappings
import bioregistry
import networkx as nx
import pandas as pd
from biomappings.resources import PredictionTuple, append_prediction_tuples
from gilda.grounder import Grounder, ScoredMatch
from tqdm import tqdm

from mira.dkg.construct import GILDA_TERMS_PATH, NODES_PATH
from mira.dkg.utils import PREFIXES

# Prefixes whose nodes are eligible as mapping *sources* — the
# infectious-disease / vaccine ontologies this script proposes mappings for.
source_whitelist = {
    "apollosv",
    "idomal",
    "cemo",
    "ido",
    "vo",
    "ovae",
    "oae",
    "cido",
    "covoc",
    "idocovid19",
    "vido",
}

# Prefixes excluded from mapping consideration (broad vocabularies,
# metadata schemas, and identifier namespaces).
# NOTE(review): this set is not referenced anywhere in this file's visible
# code — confirm whether it is consumed elsewhere or is dead code.
blacklist = {
    "hp",
    "doid",
    "chebi",
    "uberon",
    "ncbitaxon",
    "foaf",
    "uo",
    "oboinowl",
    "owl",
    "rdf",
    "doi",
    "pubmed",
    "pmc",
    "dc",
    "debio",
    "ro",
    "bfo",
    "iao",
}


def main():
    """Propose new equivalence mappings for curation in Biomappings.

    Builds an undirected graph of *known* cross-references (curated
    Biomappings plus the DKG node table's xref columns), then uses Gilda
    lexical grounding over node names to predict ``skos:exactMatch``
    mappings that are not already present, and appends them to the
    Biomappings predictions resource.

    Side effects: writes prediction rows via
    :func:`biomappings.resources.append_prediction_tuples`.
    """
    imported_prefixes = set(PREFIXES)
    grounder = Grounder(GILDA_TERMS_PATH)
    df = pd.read_csv(NODES_PATH, sep="\t")

    xref_graph = _build_xref_graph(df, imported_prefixes)

    idx = df["name:string"].notna()
    rows = []
    for curie, name in tqdm(
        df[idx][["id:ID", "name:string"]].values,
        unit_scale=True,
        unit="node",
        desc="Matching",
    ):
        prefix, identifier = curie.split(":", 1)
        # Only propose mappings originating from the curated whitelist.
        if prefix not in source_whitelist:
            continue
        scored_matches: List[ScoredMatch] = grounder.ground(name)
        for scored_match in scored_matches:
            term = scored_match.term
            xref_prefix, xref_id = bioregistry.normalize_parsed_curie(
                term.db, term.id
            )
            xref_curie = bioregistry.curie_to_str(xref_prefix, xref_id)
            # Skip trivial same-prefix matches and already-known mappings.
            if prefix == xref_prefix or xref_graph.has_edge(curie, xref_curie):
                continue
            rows.append(
                PredictionTuple(
                    source_prefix=prefix,
                    source_id=identifier,
                    source_name=name,
                    relation="skos:exactMatch",
                    target_prefix=xref_prefix,
                    target_identifier=xref_id,
                    target_name=term.entry_name,
                    type="lexical",
                    confidence=scored_match.score,
                    source="mira",
                )
            )

    append_prediction_tuples(rows)


def _build_xref_graph(df, imported_prefixes):
    """Build an undirected graph of known cross-references.

    Combines curated Biomappings (restricted to prefixes imported into the
    DKG) with the semicolon-delimited ``xrefs:string[]`` column of the node
    table *df*.

    Note: the original inline version also accumulated a per-node set of
    xref prefixes (``xref_prefixes``) that was never read; that dead code
    has been removed.
    """
    xref_graph = nx.Graph()
    for mapping in tqdm(
        biomappings.load_mappings(),
        unit_scale=True,
        unit="mapping",
        desc="caching biomappings",
    ):
        source_prefix = mapping["source prefix"]
        target_prefix = mapping["target prefix"]
        # Edges only matter between prefixes actually imported into the DKG.
        if (
            source_prefix not in imported_prefixes
            or target_prefix not in imported_prefixes
        ):
            continue
        xref_graph.add_edge(
            bioregistry.curie_to_str(
                *bioregistry.normalize_parsed_curie(
                    source_prefix,
                    mapping["source identifier"],
                )
            ),
            bioregistry.curie_to_str(
                *bioregistry.normalize_parsed_curie(
                    target_prefix,
                    mapping["target identifier"],
                )
            ),
        )

    for curie, xrefs in tqdm(
        df[["id:ID", "xrefs:string[]"]].values,
        unit_scale=True,
        unit="node",
        desc="caching xrefs",
    ):
        if not xrefs or pd.isna(xrefs):
            continue
        for xref in xrefs.split(";"):
            xref_graph.add_edge(curie, xref)

    return xref_graph


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()

0 comments on commit 58b67dc

Please sign in to comment.