-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
261 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
"""Construct a Gilda grounding cache for all terms in the graph.""" | ||
|
||
import csv | ||
import gzip | ||
from typing import Iterable | ||
|
||
import click | ||
import pandas as pd | ||
from gilda.process import normalize | ||
from gilda.term import Term, filter_out_duplicates | ||
from tqdm import tqdm | ||
|
||
from mira.dkg.construct import GILDA_TERMS_PATH, NODES_PATH, upload_s3 | ||
|
||
|
||
@click.command()
@click.option("--upload", is_flag=True, help="Upload the cache to S3 after writing.")
def main(upload: bool):
    """Construct the Gilda grounding cache and optionally upload it to S3.

    Without ``is_flag=True`` click would treat ``--upload`` as a value
    option and error when invoked bare, even though the handler expects
    a boolean toggle.
    """
    _main(upload=upload)
|
||
|
||
def _main(upload: bool):
    """Write the deduplicated Gilda terms to a gzipped TSV.

    :param upload: If true, push the resulting file to S3 afterwards.
    """
    # Column order mirrors the fields emitted by Term.to_list().
    columns = (
        "norm_text",
        "text",
        "db",
        "id",
        "entry_name",
        "status",
        "source",
        "organism",
        "source_db",
        "source_id",
    )
    unique_terms = filter_out_duplicates(list(_iter_terms()))
    with gzip.open(GILDA_TERMS_PATH, "wt", encoding="utf-8") as file:
        tsv_writer = csv.writer(file, delimiter="\t")
        tsv_writer.writerow(columns)
        for term in unique_terms:
            tsv_writer.writerow(term.to_list())
    if upload:
        upload_s3(GILDA_TERMS_PATH)
|
||
|
||
def _iter_terms() -> Iterable[Term]:
    """Yield one Gilda term per node name and per node synonym.

    Reads the DKG nodes TSV; rows with no name are skipped. Synonyms are
    stored as a single ``;``-delimited string in the fourth column.
    """
    nodes = pd.read_csv(NODES_PATH, sep="\t")
    progress = tqdm(nodes.values, unit_scale=True, unit="node")
    # Unpack the full 15-column node row; only a handful are used here.
    for (
        curie,
        _,
        name,
        synonyms,
        _obsolete,
        _type,
        _description,
        xrefs,
        alts,
        _version,
        _prop_preds,
        _prop_values,
        xref_types,
        synonym_types,
        _sources,
    ) in progress:
        if not name or pd.isna(name):
            continue  # unnamed nodes cannot be grounded lexically
        prefix, identifier = curie.split(":", 1)

        def make_term(text: str, status: str) -> Term:
            # Every generated term grounds back to the node's own CURIE.
            return Term(
                norm_text=normalize(text),
                text=text,
                db=prefix,
                id=identifier,
                entry_name=name,
                status=status,
                source=prefix,
            )

        yield make_term(name, "name")
        if synonyms and not pd.isna(synonyms):
            for raw_synonym in synonyms.split(";"):
                if raw_synonym.strip():
                    yield make_term(raw_synonym, "synonym")
|
||
|
||
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
"""A script to propose equivalent nodes in the DKG that aren't already mapped.""" | ||
|
||
from collections import defaultdict | ||
from typing import List | ||
|
||
import biomappings | ||
import bioregistry | ||
import networkx as nx | ||
import pandas as pd | ||
from biomappings.resources import PredictionTuple, append_prediction_tuples | ||
from gilda.grounder import Grounder, ScoredMatch | ||
from tqdm import tqdm | ||
|
||
from mira.dkg.construct import GILDA_TERMS_PATH, NODES_PATH | ||
from mira.dkg.utils import PREFIXES | ||
|
||
#: Only nodes whose CURIE prefix appears in this set are used as mapping
#: *sources* when proposing predictions (checked in :func:`main` below).
source_whitelist = {
    "apollosv",
    "idomal",
    "cemo",
    "ido",
    "vo",
    "ovae",
    "oae",
    "cido",
    "covoc",
    "idocovid19",
    "vido",
}
|
||
# NOTE(review): this set is not referenced anywhere in this file —
# confirm whether it should be applied to filter grounding targets
# (e.g., skipping matches whose prefix is listed here) or be removed.
blacklist = {
    "hp",
    "doid",
    "chebi",
    "uberon",
    "ncbitaxon",
    "foaf",
    "uo",
    "oboinowl",
    "owl",
    "rdf",
    "doi",
    "pubmed",
    "pmc",
    "dc",
    "debio",
    "ro",
    "bfo",
    "iao",
}
|
||
|
||
def main():
    """Propose mappings for curation in Biomappings.

    Grounds the name of every DKG node from a whitelisted source prefix
    against the Gilda term cache, discards matches that are already
    connected by a known equivalence (a curated Biomappings mapping or a
    DKG xref), and appends the remainder as "lexical" prediction rows.

    Fix: the original version also built an ``xref_prefixes``
    ``defaultdict(set)`` that was never read afterwards; that dead code
    (and the per-xref prefix split feeding it) has been removed.
    """
    imported_prefixes = set(PREFIXES)

    grounder = Grounder(GILDA_TERMS_PATH)

    # Undirected graph of known equivalences. An edge means "already
    # mapped", so any grounding match along an edge is skipped below.
    xref_graph = nx.Graph()
    for mapping in tqdm(
        biomappings.load_mappings(),
        unit_scale=True,
        unit="mapping",
        desc="caching biomappings",
    ):
        source_prefix = mapping["source prefix"]
        target_prefix = mapping["target prefix"]
        # Only mappings between prefixes present in the DKG are relevant.
        if (
            source_prefix not in imported_prefixes
            or target_prefix not in imported_prefixes
        ):
            continue
        xref_graph.add_edge(
            bioregistry.curie_to_str(
                *bioregistry.normalize_parsed_curie(
                    source_prefix,
                    mapping["source identifier"],
                )
            ),
            bioregistry.curie_to_str(
                *bioregistry.normalize_parsed_curie(
                    target_prefix,
                    mapping["target identifier"],
                )
            ),
        )

    df = pd.read_csv(NODES_PATH, sep="\t")

    # Xrefs already recorded on DKG nodes also count as known equivalences.
    for curie, xrefs in tqdm(
        df[["id:ID", "xrefs:string[]"]].values,
        unit_scale=True,
        unit="node",
        desc="caching xrefs",
    ):
        if not xrefs or pd.isna(xrefs):
            continue
        for xref in xrefs.split(";"):
            xref_graph.add_edge(curie, xref)

    idx = df["name:string"].notna()
    rows = []
    for curie, name in tqdm(
        df[idx][["id:ID", "name:string"]].values,
        unit_scale=True,
        unit="node",
        desc="Matching",
    ):
        prefix, identifier = curie.split(":", 1)
        if prefix not in source_whitelist:
            continue
        scored_matches: List[ScoredMatch] = grounder.ground(name)
        for scored_match in scored_matches:
            term = scored_match.term
            xref_prefix, xref_id = bioregistry.normalize_parsed_curie(
                term.db, term.id
            )
            xref_curie = bioregistry.curie_to_str(xref_prefix, xref_id)
            # Skip self-matches and pairs that are already mapped.
            if prefix == xref_prefix or xref_graph.has_edge(curie, xref_curie):
                continue
            rows.append(
                PredictionTuple(
                    source_prefix=prefix,
                    source_id=identifier,
                    source_name=name,
                    relation="skos:exactMatch",
                    target_prefix=xref_prefix,
                    target_identifier=xref_id,
                    target_name=term.entry_name,
                    type="lexical",
                    confidence=scored_match.score,
                    source="mira",
                )
            )

    append_prediction_tuples(rows)
|
||
|
||
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()