Skip to content

Commit

Permalink
Merge pull request #95 from indralab/lexical-merge
Browse files Browse the repository at this point in the history
Add Biomappings support for the DKG
  • Loading branch information
cthoyt authored Jul 5, 2023
2 parents 7f849c7 + 8fe2d81 commit 65407db
Show file tree
Hide file tree
Showing 7 changed files with 313 additions and 33 deletions.
94 changes: 65 additions & 29 deletions mira/dkg/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pathlib import Path
from typing import Dict, NamedTuple, Sequence, Union, Optional

import biomappings
import bioontologies
import click
import pyobo
Expand Down Expand Up @@ -248,18 +249,18 @@ def construct(
for edge_node in edge_graph.nodes:
if edge_node.deprecated or edge_node.id.startswith("_:genid"):
continue
if not edge_node.lbl:
if not edge_node.name:
if edge_node.id in LABELS:
edge_node.lbl = LABELS[edge_node.id]
edge_node.name = LABELS[edge_node.id]
elif edge_node.prefix:
edge_node.lbl = edge_node.luid
edge_node.name = edge_node.identifier
else:
click.secho(f"missing label for {edge_node.curie}")
continue
if not edge_node.prefix:
tqdm.write(f"unparsable IRI: {edge_node.id} - {edge_node.lbl}")
tqdm.write(f"unparsable IRI: {edge_node.id} - {edge_node.name}")
continue
edge_names[edge_node.curie] = edge_node.lbl.strip()
edge_names[edge_node.curie] = edge_node.name.strip()
EDGE_NAMES_PATH.write_text(json.dumps(edge_names, sort_keys=True, indent=2))

# A mapping from CURIEs to node information tuples
Expand Down Expand Up @@ -463,7 +464,7 @@ def construct(
property_values="",
xref_types="", # TODO
synonym_types=";".join(
synonym.type or "skos:exactMatch" for synonym in term.synonyms or []
synonym.type.curie for synonym in term.synonyms or []
),
)
for parent in term.parents:
Expand Down Expand Up @@ -518,6 +519,9 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
else:
return curie_

biomappings_xref_graph = biomappings.get_true_graph()
added_biomappings = 0

for prefix in use_case_paths.prefixes:
if prefix in {"geonames", "uat", "probonto"}: # added with custom code
continue
Expand Down Expand Up @@ -560,13 +564,12 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
f"{manager.get_name(prefix)} ({len(_graphs)} graphs)", fg="green", bold=True
)
for graph in tqdm(_graphs, unit="graph", desc=prefix, leave=False):
if not graph.id:
raise ValueError(f"graph in {prefix} missing an ID")
graph_id = graph.id or prefix
version = graph.version
if version == "imports":
version = None
for node in graph.nodes:
if node.deprecated or not node.prefix or not node.luid:
if node.deprecated or not node.reference:
continue
if node.id.startswith("_:gen"): # skip blank nodes
continue
Expand All @@ -581,25 +584,45 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
if curie not in nodes or (curie in nodes and prefix == node.prefix):
# TODO filter out properties that are covered elsewhere
properties = sorted(
(prop.pred_curie, prop.val_curie)
(prop.predicate.curie, prop.value.curie)
for prop in node.properties
if prop.pred_prefix and prop.val_prefix
if prop.predicate and prop.value
)
property_predicates, property_values = [], []
for pred_curie, val_curie in properties:
property_predicates.append(pred_curie)
property_values.append(val_curie)

xref_predicates, xref_references = [], []
for xref in node.xrefs or []:
if xref.predicate and xref.value:
xref_predicates.append(xref.predicate.curie)
xref_references.append(xref.value.curie)

if node.curie in biomappings_xref_graph:
for xref_curie in biomappings_xref_graph.neighbors(node.curie):
if ":" not in xref_curie:
continue
added_biomappings += 1
xref_predicate = biomappings_xref_graph.edges[node.curie, xref_curie][
"relation"
]
if xref_predicate == "speciesSpecific":
xref_predicate = "debio:0000003"
xref_predicates.append(xref_predicate)
xref_references.append(xref_curie)

nodes[curie] = NodeInfo(
curie=node.curie,
prefix=node.prefix,
label=node.lbl.strip('"')
label=node.name.strip('"')
.strip()
.strip('"')
.replace("\n", " ")
.replace(" ", " ")
if node.lbl
if node.name
else "",
synonyms=";".join(synonym.val for synonym in node.synonyms),
synonyms=";".join(synonym.value for synonym in node.synonyms),
deprecated="true" if node.deprecated else "false", # type:ignore
# TODO better way to infer type based on hierarchy
# (e.g., if rdfs:type available, consider as instance)
Expand All @@ -608,16 +631,15 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
.replace('"', "")
.replace("\n", " ")
.replace(" ", " "),
xrefs=";".join(xref.curie for xref in node.xrefs if xref.prefix),
xrefs=";".join(xref_references),
alts=";".join(node.alternative_ids),
version=version or "",
property_predicates=";".join(property_predicates),
property_values=";".join(property_values),
xref_types=";".join(
xref.pred for xref in node.xrefs or [] if xref.prefix
),
xref_types=";".join(xref_predicates),
synonym_types=";".join(
synonym.pred for synonym in node.synonyms
synonym.predicate.curie if synonym.predicate else synonym.predicate_raw
for synonym in node.synonyms
),
)

Expand All @@ -629,7 +651,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
"replaced_by",
"iao:0100001",
prefix,
graph.id,
graph_id,
version or "",
)
)
Expand Down Expand Up @@ -668,7 +690,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
"xref",
"oboinowl:hasDbXref",
prefix,
graph.id,
graph_id,
version or "",
)
)
Expand All @@ -691,12 +713,15 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
synonym_types="",
)

for provenance_curie in node.get_provenance():
for provenance in node.get_provenance():
if ":" in provenance.identifier:
tqdm.write(f"Malformed provenance for {node.curie}")
provenance_curie = provenance.curie
node_sources[provenance_curie].add(prefix)
if provenance_curie not in nodes:
nodes[provenance_curie] = NodeInfo(
curie=provenance_curie,
prefix=provenance_curie.split(":")[0],
prefix=provenance.prefix,
label="",
synonyms="",
deprecated="false",
Expand All @@ -717,7 +742,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
"has_citation",
"debio:0000029",
prefix,
graph.id,
graph_id,
version or "",
)
)
Expand Down Expand Up @@ -751,25 +776,34 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
+ "\n"
)

unstandardized_nodes.extend(node.id for node in graph.nodes if not node.prefix)
unstandardized_nodes.extend(node.id for node in graph.nodes if not node.reference)
unstandardized_edges.extend(
edge.pred for edge in graph.edges if edge.pred.startswith("http")
)

clean_edges = (
edge
for edge in graph.edges
if (
edge.subject is not None
and edge.predicate is not None
and edge.object is not None
and edge.object.curie not in OBSOLETE
)
)
edges.extend(
(
edge.sub,
edge.obj,
_get_edge_name(edge.pred).lower().replace(" ", "_").replace("-", "_"),
edge.pred,
prefix,
graph.id,
graph_id,
version or "",
)
for edge in tqdm(
sorted(graph.edges, key=methodcaller("as_tuple")), unit="edge", unit_scale=True
sorted(clean_edges, key=methodcaller("as_tuple")), unit="edge", unit_scale=True
)
if edge.obj not in OBSOLETE
)

for sub, obj, pred_label, pred, *_ in edges:
Expand All @@ -787,6 +821,8 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
writer.writerows(edges)
tqdm.write(f"output edges to {edges_path}")

tqdm.write(f"incorporated {added_biomappings:,} xrefs from biomappings")

with gzip.open(use_case_paths.NODES_PATH, "wt") as file:
writer = csv.writer(file, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
writer.writerow(NODE_HEADER)
Expand Down Expand Up @@ -943,7 +979,7 @@ def get_node_info(term: pyobo.Term, type: EntityType = "class"):
property_values="",
xref_types="",
synonym_types=";".join(
synonym.type or "skos:exactMatch" for synonym in term.synonyms or []
synonym.type.curie for synonym in term.synonyms or []
),
)

Expand Down
92 changes: 92 additions & 0 deletions mira/dkg/construct_gilda_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Construct a Gilda grounding cache for all terms in the graph."""

import csv
import gzip
from typing import Iterable

import click
import pandas as pd
from gilda.process import normalize
from gilda.term import Term, filter_out_duplicates
from tqdm import tqdm

from mira.dkg.construct import GILDA_TERMS_PATH, NODES_PATH, upload_s3


@click.command()
@click.option(
    "--upload",
    is_flag=True,  # without is_flag, click makes this a value-taking str option,
                   # but _main expects a bool — `--upload` alone would be a CLI error
    help="Upload the Gilda term cache to S3 after construction.",
)
def main(upload):
    """Construct the Gilda grounding cache for all terms in the graph."""
    _main(upload=upload)


def _main(upload: bool):
    """Write the deduplicated Gilda term cache as a gzipped TSV.

    :param upload: If true, push the resulting file to S3 after writing.
    """
    deduplicated = filter_out_duplicates(list(_iter_terms()))
    # Column order matches gilda.term.Term.to_list()
    columns = (
        "norm_text",
        "text",
        "db",
        "id",
        "entry_name",
        "status",
        "source",
        "organism",
        "source_db",
        "source_id",
    )
    with gzip.open(GILDA_TERMS_PATH, "wt", encoding="utf-8") as handle:
        tsv_writer = csv.writer(handle, delimiter="\t")
        tsv_writer.writerow(columns)
        for term in deduplicated:
            tsv_writer.writerow(term.to_list())
    if upload:
        upload_s3(GILDA_TERMS_PATH)


def _iter_terms() -> Iterable[Term]:
    """Yield a Gilda term for each name and each synonym in the DKG node table.

    Nodes with a missing/NaN name are skipped entirely; blank synonym
    entries (after splitting on ``;``) are skipped individually.
    """
    frame = pd.read_csv(NODES_PATH, sep="\t")
    for row in tqdm(frame.values, unit_scale=True, unit="node"):
        # Positional unpacking must match the NODES_PATH column order.
        (
            curie,
            _,
            name,
            synonyms,
            _obsolete,
            _type,
            _description,
            _xrefs,
            _alts,
            _version,
            _prop_preds,
            _prop_values,
            _xref_types,
            _synonym_types,
            _sources,
        ) = row
        if not name or pd.isna(name):
            continue
        prefix, identifier = curie.split(":", 1)
        # Fields shared by the name term and all of its synonym terms.
        shared = dict(db=prefix, id=identifier, entry_name=name, source=prefix)
        yield Term(norm_text=normalize(name), text=name, status="name", **shared)
        if synonyms and not pd.isna(synonyms):
            for synonym in synonyms.split(";"):
                if synonym.strip():
                    yield Term(
                        norm_text=normalize(synonym),
                        text=synonym,
                        status="synonym",
                        **shared,
                    )


# Allow running this module directly as a script; click parses the CLI args.
if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions mira/dkg/construct_rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
# Should be fixed in https://github.com/geneontology/go-ontology/pull/24148
# and after HP re-imports GO
"doi:10.1002/(SICI)1097-4687(199608)229:2<121::AID-JMOR1>3.0.CO;2-4",
# https://github.com/obophenotype/human-phenotype-ontology/pull/9812
"pubmed:14645606|PMID:14647932|PMID:31669363",
}
REMAPPING = {
REFERENCED_BY_SYMBOL: "debio:0000030",
Expand Down
1 change: 0 additions & 1 deletion mira/dkg/construct_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from typing import Optional, Set

import bioregistry
import bioregistry.app.impl
import click
from bioregistry import Manager
from tqdm import tqdm
Expand Down
Loading

0 comments on commit 65407db

Please sign in to comment.