Skip to content

Commit

Permalink
Merge pull request #883 from jhu-bids/voc-refresh-graph-update
Browse files Browse the repository at this point in the history
Vocab refresh: Update `networkx` graphs
  • Loading branch information
joeflack4 authored Nov 18, 2024
2 parents 33aea3e + ddc5a03 commit fdb472e
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 153 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/test_backend_e2e_and_unit_and_qc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ jobs:
steps:
- name: Check out repository
uses: actions/checkout@v3
with:
submodules: true
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
Expand Down
53 changes: 25 additions & 28 deletions backend/db/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def get_dependent_tables_queue(independent_tables: Union[List[str], str], _filte
elif _filter == 'tables':
return [x for x in queue if x not in views]

# todo: temp fix: see "DDL" comment in docstring
# todo - temporary fix: see "DDL" comment in docstring
ddl_order_based_queue: List[str] = order_modules_by_ddl_order(queue)

return ddl_order_based_queue
Expand Down Expand Up @@ -336,6 +336,28 @@ def current_datetime(time_zone=TIMEZONE_DEFAULT) -> str:
return tz_datetime_str(datetime.now(), time_zone=time_zone)


# todo: Can update update_db_status_var() so that it can accept optional param 'con' to improve performance.
def update_db_status_var(key: str, val: str, local=False):
"""Update the `manage` table with information for a given variable, e.g. when a table was last updated"""
with get_db_connection(schema='', local=local) as con:
run_sql(con, f"DELETE FROM public.manage WHERE key = '{key}';")
sql_str = f"INSERT INTO public.manage (key, value) VALUES (:key, :val);"
run_sql(con, sql_str, {'key': key, 'val': val})


def check_db_status_var(key: str, local=False):
"""Check the value of a given variable the `manage`table """
with get_db_connection(schema='', local=local) as con:
results: List = sql_query_single_col(con, f"SELECT value FROM public.manage WHERE key = '{key}';")
return results[0] if results else None


def delete_db_status_var(key: str, local=False):
"""Delete information from the `manage` table """
with get_db_connection(schema='', local=local) as con2:
run_sql(con2, f"DELETE FROM public.manage WHERE key = '{key}';")


def last_refresh_timestamp(con: Connection) -> str:
"""Get the timestamp of the last database refresh"""
results: List[List] = sql_query(
Expand All @@ -352,14 +374,11 @@ def is_up_to_date(last_updated: Union[datetime, str], threshold_hours=24) -> boo
return hours_since_update < threshold_hours


def check_if_updated(key: str, skip_if_updated_within_hours: int = None) -> bool:
def check_if_updated(key: str, skip_if_updated_within_hours: int = None, local=False) -> bool:
"""Check if table is up to date"""
with get_db_connection(schema='') as con2:
results: List[List] = sql_query(con2, f"SELECT value FROM public.manage WHERE key = '{key}';", return_with_keys=False)
last_updated = results[0][0] if results else None
last_updated: str = check_db_status_var(key, local)
return last_updated and is_up_to_date(last_updated, skip_if_updated_within_hours)


def is_table_up_to_date(table_name: str, skip_if_updated_within_hours: int = None) -> bool:
"""Check if table is up to date"""
if not skip_if_updated_within_hours:
Expand Down Expand Up @@ -430,28 +449,6 @@ def is_derived_refresh_active(local=False) -> bool:
return is_refresh_active('derived', local)


# todo: Can update update_db_status_var() so that it can accept optional param 'con' to improve performance.
def update_db_status_var(key: str, val: str, local=False):
"""Update the `manage` table with information for a given variable, e.g. when a table was last updated"""
with get_db_connection(schema='', local=local) as con:
run_sql(con, f"DELETE FROM public.manage WHERE key = '{key}';")
sql_str = f"INSERT INTO public.manage (key, value) VALUES (:key, :val);"
run_sql(con, sql_str, {'key': key, 'val': val})


def check_db_status_var(key: str, local=False):
"""Check the value of a given variable the `manage`table """
with get_db_connection(schema='', local=local) as con:
results: List = sql_query_single_col(con, f"SELECT value FROM public.manage WHERE key = '{key}';")
return results[0] if results else None


def delete_db_status_var(key: str, local=False):
"""Delete information from the `manage` table """
with get_db_connection(schema='', local=local) as con2:
run_sql(con2, f"DELETE FROM public.manage WHERE key = '{key}';")


def insert_fetch_statuses(rows: List[Dict], local=False):
"""Update fetch status of record
:param: rows: expects keys 'comment', 'primary_key', 'table', and 'status_initially'."""
Expand Down
106 changes: 38 additions & 68 deletions backend/routes/graph.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,20 @@
"""Graph related functions and routes"""
import os, warnings
# import csv
# import io
# import json
import dateutil.parser as dp
from datetime import datetime
from pathlib import Path
from typing import Any, Iterable, List, Set, Tuple, Union, Dict, Optional

import pickle
import networkx as nx
# import pydot
from fastapi import APIRouter, Query, Request
from networkx import DiGraph
from sqlalchemy import Row, RowMapping
from sqlalchemy.sql import text

# from fastapi.responses import JSONResponse
# from fastapi.responses import Response
# from fastapi.encoders import jsonable_encoder
# from collections import OrderedDict
# from igraph import Graph
# from networkx.drawing.nx_pydot import to_pydot, from_pydot

from backend.routes.db import get_cset_members_items
from backend.db.queries import get_concepts
from backend.db.utils import get_db_connection, SCHEMA
from backend.db.utils import check_db_status_var, get_db_connection, SCHEMA
from backend.api_logger import Api_logger
from backend.utils import get_timer, commify

Expand All @@ -45,17 +36,15 @@ async def concept_graph_get(
) -> Dict[str, Any]:
"""Return concept graph"""
cids = cids if cids else []
return await concept_graph_post(request, codeset_ids, cids, hide_vocabs,
hide_nonstandard_concepts, verbose)
return await concept_graph_post(request, codeset_ids, cids, hide_vocabs, hide_nonstandard_concepts, verbose)


# todo: match return of concept_graph()
@router.post("/concept-graph")
async def concept_graph_post(
request: Request, codeset_ids: List[int], cids: Union[List[int], None] = [],
hide_vocabs = ['RxNorm Extension'], hide_nonstandard_concepts=False, verbose = VERBOSE,
) -> List[List[Union[int, Any]]]:

) -> Dict:
"""Return concept graph via HTTP POST"""
rpt = Api_logger()
try:
await rpt.start_rpt(request, params={'codeset_ids': codeset_ids, 'cids': cids})
Expand Down Expand Up @@ -143,19 +132,23 @@ async def concept_graph(
return sg, concept_ids, more_concept_ids, hidden_by_voc, nonstandard_concepts_hidden


def get_all_descendants(G: nx.DiGraph, subgraph_nodes: Union[List[int], Set[int]], verbose=VERBOSE) -> Set:
# using this instead of get_missing_in_between_nodes. this way the front end has the entire
# descendant tree for all concepts being looked at
def get_all_descendants(g: nx.DiGraph, subgraph_nodes: Union[List[int], Set[int]]) -> Set[int]:
"""Get all descendants of a set of nodes
Using this instead of get_missing_in_between_nodes. this way the front end has the entire descendant tree for all
concepts being looked at.
"""
descendants: Set[int] = set()
for node in subgraph_nodes:
if G.has_node(node):
descendants.update(G.successors(node))
if g.has_node(node):
descendants.update(g.successors(node))
return descendants


# TODO: @Siggie: move below to frontend
# noinspection PyPep8Naming
def MOVE_TO_FRONT_END():
"""Move to front end"""
hidden_by_voc = {}
hide_if_over = 50
tree = [] # this used to be indented tree stuff that we're no longer using
Expand Down Expand Up @@ -204,9 +197,13 @@ def filter_concepts(
print_stack = lambda s: ' | '.join([f"{n} => {','.join([str(x) for x in p])}" for n,p in s])


# not using this anymore
# noinspection PyPep8Naming
def get_missing_in_between_nodes(G: nx.DiGraph, subgraph_nodes: Union[List[int], Set[int]], verbose=VERBOSE) -> Set:
# not using this anymore
"""Find any missing nodes that exist in a subgraph.
This can happen when a concept set expansions that are indirect subtrees of other expansions.
For"""
missing_in_between_nodes = set()
missing_in_between_nodes_tmp = set()
subgraph_nodes = set(subgraph_nodes)
Expand Down Expand Up @@ -266,23 +263,6 @@ def get_missing_in_between_nodes(G: nx.DiGraph, subgraph_nodes: Union[List[int],
return missing_in_between_nodes


def test_get_missing_in_between_nodes(
whole_graph_edges=None, non_subgraph_nodes=None, expected_missing_in_between_nodes=None, subgraph_nodes=None,
fail=True, verbose=False
):
# add code to load whole REL_GRAPH
G = DiGraph(whole_graph_edges)
subgraph_nodes = subgraph_nodes or set(G.nodes) - set(non_subgraph_nodes)
missing_in_between_nodes = get_missing_in_between_nodes(G, subgraph_nodes, verbose=verbose)
if fail:
assert missing_in_between_nodes == set(expected_missing_in_between_nodes)
else:
if missing_in_between_nodes == set(expected_missing_in_between_nodes):
print(f"passed with {missing_in_between_nodes}")
else:
print(f"expected {expected_missing_in_between_nodes}, got {missing_in_between_nodes}")


@router.get("/wholegraph")
def wholegraph():
"""Get subgraph edges for the whole graph"""
Expand Down Expand Up @@ -392,23 +372,26 @@ def create_rel_graphs(save_to_pickle: bool) -> DiGraph:
return G # , Gu


def load_relationship_graph(save_if_not_exists=True):
def is_graph_up_to_date(graph_path: str = GRAPH_PATH) -> bool:
"""Determine if the networkx relationship_graph derived from OMOP vocab is current"""
voc_last_updated = dp.parse(check_db_status_var('last_refreshed_vocab_tables'))
graph_last_updated = datetime.fromtimestamp(os.path.getmtime(graph_path))
if voc_last_updated.tzinfo and not graph_last_updated.tzinfo: # if one has timezone, both need
graph_last_updated = graph_last_updated.replace(tzinfo=voc_last_updated.tzinfo)
return graph_last_updated > voc_last_updated


# noinspection PyPep8Naming for_G
def load_relationship_graph(graph_path: str = GRAPH_PATH, update_if_outdated=True, save=True) -> DiGraph:
"""Load relationship graph from disk"""
timer = get_timer('./load_relationship_graph')
timer(f'loading {GRAPH_PATH}')
if os.path.isfile(GRAPH_PATH):
with open(GRAPH_PATH, 'rb') as pickle_file:
# noinspection PyPep8Naming
G = pickle.load(pickle_file)
# while True:
# try:
# chunk = pickle.load(pickle_file)
# G.add_edges_from(chunk)
# except EOFError:
# break # End of file reached
timer(f'loading {graph_path}')
up_to_date = True if not update_if_outdated else is_graph_up_to_date(GRAPH_PATH)
if os.path.isfile(graph_path) and up_to_date:
with open(graph_path, 'rb') as pickle_file:
G: DiGraph = pickle.load(pickle_file)
else:
# noinspection PyPep8Naming
G = create_rel_graphs(save_if_not_exists)
G: DiGraph = create_rel_graphs(save)
timer('done')
return G

Expand All @@ -423,20 +406,7 @@ def load_relationship_graph(save_if_not_exists=True):
# import builtins
# builtins.DONT_LOAD_GRAPH = True
import builtins

if hasattr(builtins, 'DONT_LOAD_GRAPH') and builtins.DONT_LOAD_GRAPH:
warnings.warn('not loading relationship graph')
else:
REL_GRAPH = load_relationship_graph(save_if_not_exists=True)
# REVERSE_GRAPH = REL_GRAPH.reverse()

# G_ROOTS = set([n for n in REL_GRAPH.nodes if REL_GRAPH.in_degree(n) == 0])
# def distance_to_root(G, node):
# n = node
# d = 0
# for p in G.predecessors(node):
# if n in G_ROOTS:
# return d
# d += 1
# n = p
# raise Exception(f"can't find root for {node}")
REL_GRAPH = load_relationship_graph()
90 changes: 90 additions & 0 deletions test/attic/testing_get_missing_in_between_nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Testing get_missing_in_between_nodes()
TODO: As of 2024/11 this is deprecated. Siggie made a note on get_missing_in_between_nodes() that it is not being used
anymore.
If we want to re-use this, should do the following:
i. correct imports so it will work (this was previously at the root dir)
ii. Use unittest module to execute these tests
todo: @Siggie: (1) Should this be a proper Python test file?, (2) I moved test_get_missing_in_between_nodes() here
because the Python debugger was making that file run this as a test instead of doing what I wanted. But IDK if this
breaks anything for you.
"""
import networkx as nx
import builtins

from networkx import DiGraph

builtins.DONT_LOAD_GRAPH = True
# from backend.routes.graph import get_missing_in_between_nodes, test_get_missing_in_between_nodes
from backend.routes.graph import get_missing_in_between_nodes

print_stack = lambda s: '; '.join([f"""{n}{'<--' if p else ''}{','.join(p)}""" for n,p in reversed(s)])


def testing_get_missing_in_between_nodes(
whole_graph_edges=None, non_subgraph_nodes=None, expected_missing_in_between_nodes=None, subgraph_nodes=None,
fail=True, verbose=False
):
# add code to load whole REL_GRAPH
# noinspection PyPep8Naming
G = DiGraph(whole_graph_edges)
subgraph_nodes = subgraph_nodes or set(G.nodes) - set(non_subgraph_nodes)
missing_in_between_nodes = get_missing_in_between_nodes(G, subgraph_nodes, verbose=verbose)
if fail:
assert missing_in_between_nodes == set(expected_missing_in_between_nodes)
else:
if missing_in_between_nodes == set(expected_missing_in_between_nodes):
print(f"passed with {missing_in_between_nodes}")
else:
print(f"expected {expected_missing_in_between_nodes}, got {missing_in_between_nodes}")


def get_missing(edges, subgraph_nodes, verbose=False):
G = nx.DiGraph(edges)
n = get_missing_in_between_nodes(G, subgraph_nodes, verbose=verbose)
print(f"Found {n} missing nodes\n")
return n

graph_as_ascii = """
a
/ \
b i
/ |
| j
c___/
|
d___
/ \ \
e h g
| |
\ /
f
"""
test_edges = [
('a', 'b'),
('c', 'd'),
('a', 'i'),
('i', 'j'),
('b', 'c'),
('j', 'c'),
('d', 'e'),
('d', 'g'),
('d', 'h'),
('e', 'f'),
('h', 'f'),
# ('j', 'k'), # extra nodes beyond the real-life example, for some other testing
# ('l', 'h'),
# ('k', 'h'),
]
# subgraph_nodes = ['f', 'd', ]
# expected_missing_in_between_nodes = ['e', 'h']
subgraph_nodes = ['f', 'd', 'i',]
expected_missing_in_between_nodes = ['e', 'h', 'c', 'j']
# subgraph_nodes = ['f', 'd', 'g', 'k']

testing_get_missing_in_between_nodes(test_edges,
subgraph_nodes=subgraph_nodes,
expected_missing_in_between_nodes=expected_missing_in_between_nodes,
fail=False)
# get_missing(test_edges, subgraph_nodes)
13 changes: 11 additions & 2 deletions test/test_backend/routes/test_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,16 @@
THIS_DIR = Path(os.path.dirname(__file__))
PROJECT_ROOT = THIS_DIR.parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# todo: https://github.com/jhu-bids/TermHub/issues/784 . Failing as of https://github.com/jhu-bids/TermHub/pull/883 ,
# but examining the diff, it's not obvious why. Pickle didn't change. Loading of pickle essentially unchanged.
import builtins
builtins.DONT_LOAD_GRAPH = True
from backend.routes.graph import concept_graph, get_missing_in_between_nodes
# noinspection PyUnresolvedReferences rel_graph_exists_just_not_if_name_eq_main
from backend.routes.graph import concept_graph, REL_GRAPH, get_missing_in_between_nodes
# from backend.routes.graph import concept_graph, REL_GRAPH, get_missing_in_between_nodes
REL_GRAPH = DiGraph()


THIS_STATIC_DIR = THIS_DIR / 'static'
STATIC_DIR_concept_graph = THIS_STATIC_DIR / 'concept_graph'
Expand Down Expand Up @@ -214,7 +222,8 @@ async def test_concept_graph2(self):
self.assertEquals(len(nonstandard_concepts_hidden), 19)
self.assertEquals(len(hidden_by_voc[hide_vocabs[0]]), 92)

# TODO: Upgrade for multiple scenarios: https://github.com/jhu-bids/TermHub/issues/784
# todo: Upgrade for multiple scenarios & fix in GH action: https://github.com/jhu-bids/TermHub/issues/784
@unittest.skip("Not using tested function anymore.")
async def test_get_missing_in_between_nodes(self, verbose=False):
"""Test get_missing_in_between_nodes()"""
# - Test: Gap filling
Expand Down
Loading

0 comments on commit fdb472e

Please sign in to comment.