-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
261 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
"""Construct a Gilda grounding cache for all terms in the graph.""" | ||
|
||
import csv | ||
import gzip | ||
from typing import Iterable | ||
|
||
import click | ||
import pandas as pd | ||
from gilda.process import normalize | ||
from gilda.term import Term, filter_out_duplicates | ||
from tqdm import tqdm | ||
|
||
from mira.dkg.construct import GILDA_TERMS_PATH, NODES_PATH, upload_s3 | ||
|
||
|
||
@click.command()
@click.option("--upload", is_flag=True, help="Upload the cache to S3 after writing.")
def main(upload: bool):
    """Construct the Gilda grounding cache and optionally upload it to S3.

    Without ``is_flag=True`` click would treat ``--upload`` as a value
    option and error when invoked bare, even though the handler expects
    a boolean toggle.
    """
    _main(upload=upload)
|
||
|
||
def _main(upload: bool):
    """Write the deduplicated Gilda terms to a gzipped TSV.

    :param upload: If true, push the resulting file to S3 afterwards.
    """
    # Column order mirrors the fields emitted by Term.to_list().
    columns = (
        "norm_text",
        "text",
        "db",
        "id",
        "entry_name",
        "status",
        "source",
        "organism",
        "source_db",
        "source_id",
    )
    unique_terms = filter_out_duplicates(list(_iter_terms()))
    with gzip.open(GILDA_TERMS_PATH, "wt", encoding="utf-8") as file:
        tsv_writer = csv.writer(file, delimiter="\t")
        tsv_writer.writerow(columns)
        for term in unique_terms:
            tsv_writer.writerow(term.to_list())
    if upload:
        upload_s3(GILDA_TERMS_PATH)
|
||
|
||
def _iter_terms() -> Iterable[Term]:
    """Yield one Gilda term per node name and per node synonym.

    Reads the DKG nodes TSV; rows with no name are skipped. Synonyms are
    stored as a single ``;``-delimited string in the fourth column.
    """
    nodes = pd.read_csv(NODES_PATH, sep="\t")
    progress = tqdm(nodes.values, unit_scale=True, unit="node")
    # Unpack the full 15-column node row; only a handful are used here.
    for (
        curie,
        _,
        name,
        synonyms,
        _obsolete,
        _type,
        _description,
        xrefs,
        alts,
        _version,
        _prop_preds,
        _prop_values,
        xref_types,
        synonym_types,
        _sources,
    ) in progress:
        if not name or pd.isna(name):
            continue  # unnamed nodes cannot be grounded lexically
        prefix, identifier = curie.split(":", 1)

        def make_term(text: str, status: str) -> Term:
            # Every generated term grounds back to the node's own CURIE.
            return Term(
                norm_text=normalize(text),
                text=text,
                db=prefix,
                id=identifier,
                entry_name=name,
                status=status,
                source=prefix,
            )

        yield make_term(name, "name")
        if synonyms and not pd.isna(synonyms):
            for raw_synonym in synonyms.split(";"):
                if raw_synonym.strip():
                    yield make_term(raw_synonym, "synonym")
|
||
|
||
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
"""A script to propose equivalent nodes in the DKG that aren't already mapped.""" | ||
|
||
from collections import defaultdict | ||
from typing import List | ||
|
||
import biomappings | ||
import bioregistry | ||
import networkx as nx | ||
import pandas as pd | ||
from biomappings.resources import PredictionTuple, append_prediction_tuples | ||
from gilda.grounder import Grounder, ScoredMatch | ||
from tqdm import tqdm | ||
|
||
from mira.dkg.construct import GILDA_TERMS_PATH, NODES_PATH | ||
from mira.dkg.utils import PREFIXES | ||
|
||
#: Only nodes whose CURIE prefix appears in this set are used as mapping
#: *sources* when proposing predictions (checked in :func:`main` below).
source_whitelist = {
    "apollosv",
    "idomal",
    "cemo",
    "ido",
    "vo",
    "ovae",
    "oae",
    "cido",
    "covoc",
    "idocovid19",
    "vido",
}
|
||
# NOTE(review): this set is not referenced anywhere in this file —
# confirm whether it should be applied to filter grounding targets
# (e.g., skipping matches whose prefix is listed here) or be removed.
blacklist = {
    "hp",
    "doid",
    "chebi",
    "uberon",
    "ncbitaxon",
    "foaf",
    "uo",
    "oboinowl",
    "owl",
    "rdf",
    "doi",
    "pubmed",
    "pmc",
    "dc",
    "debio",
    "ro",
    "bfo",
    "iao",
}
|
||
|
||
def main():
    """Propose mappings for curation in Biomappings.

    Grounds the name of every DKG node from a whitelisted source prefix
    against the Gilda term cache, discards matches that are already
    connected by a known equivalence (a curated Biomappings mapping or a
    DKG xref), and appends the remainder as "lexical" prediction rows.

    Fix: the original version also built an ``xref_prefixes``
    ``defaultdict(set)`` that was never read afterwards; that dead code
    (and the per-xref prefix split feeding it) has been removed.
    """
    imported_prefixes = set(PREFIXES)

    grounder = Grounder(GILDA_TERMS_PATH)

    # Undirected graph of known equivalences. An edge means "already
    # mapped", so any grounding match along an edge is skipped below.
    xref_graph = nx.Graph()
    for mapping in tqdm(
        biomappings.load_mappings(),
        unit_scale=True,
        unit="mapping",
        desc="caching biomappings",
    ):
        source_prefix = mapping["source prefix"]
        target_prefix = mapping["target prefix"]
        # Only mappings between prefixes present in the DKG are relevant.
        if (
            source_prefix not in imported_prefixes
            or target_prefix not in imported_prefixes
        ):
            continue
        xref_graph.add_edge(
            bioregistry.curie_to_str(
                *bioregistry.normalize_parsed_curie(
                    source_prefix,
                    mapping["source identifier"],
                )
            ),
            bioregistry.curie_to_str(
                *bioregistry.normalize_parsed_curie(
                    target_prefix,
                    mapping["target identifier"],
                )
            ),
        )

    df = pd.read_csv(NODES_PATH, sep="\t")

    # Xrefs already recorded on DKG nodes also count as known equivalences.
    for curie, xrefs in tqdm(
        df[["id:ID", "xrefs:string[]"]].values,
        unit_scale=True,
        unit="node",
        desc="caching xrefs",
    ):
        if not xrefs or pd.isna(xrefs):
            continue
        for xref in xrefs.split(";"):
            xref_graph.add_edge(curie, xref)

    idx = df["name:string"].notna()
    rows = []
    for curie, name in tqdm(
        df[idx][["id:ID", "name:string"]].values,
        unit_scale=True,
        unit="node",
        desc="Matching",
    ):
        prefix, identifier = curie.split(":", 1)
        if prefix not in source_whitelist:
            continue
        scored_matches: List[ScoredMatch] = grounder.ground(name)
        for scored_match in scored_matches:
            term = scored_match.term
            xref_prefix, xref_id = bioregistry.normalize_parsed_curie(
                term.db, term.id
            )
            xref_curie = bioregistry.curie_to_str(xref_prefix, xref_id)
            # Skip self-matches and pairs that are already mapped.
            if prefix == xref_prefix or xref_graph.has_edge(curie, xref_curie):
                continue
            rows.append(
                PredictionTuple(
                    source_prefix=prefix,
                    source_id=identifier,
                    source_name=name,
                    relation="skos:exactMatch",
                    target_prefix=xref_prefix,
                    target_identifier=xref_id,
                    target_name=term.entry_name,
                    type="lexical",
                    confidence=scored_match.score,
                    source="mira",
                )
            )

    append_prediction_tuples(rows)
|
||
|
||
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()