Skip to content

Commit

Permalink
Merge branch 'amia-paper-joe' of github.com:jhu-bids/TermHub into ami…
Browse files Browse the repository at this point in the history
…a-paper-joe
  • Loading branch information
Sigfried committed Mar 18, 2024
2 parents 00e29d0 + 44f7a3c commit 59e9dad
Show file tree
Hide file tree
Showing 2 changed files with 307 additions and 112 deletions.
81 changes: 2 additions & 79 deletions backend/routes/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,22 +726,13 @@ def next_api_call_group_id() -> int:
return id


# todo: can / should we replace this query with selecting from `apijoin` table instead?
def usage_query(verbose=True) -> List[Dict]:
"""Query for usage data
Filters out problematic api_call_group_id where the call group is ambiguous (-1 or NULL)"""
t0 = datetime.now()
with get_db_connection() as con:
# what we had originally
# data: List[RowMapping] = sql_query(con, """
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
# FROM public.apiruns_grouped g
# RIGHT JOIN public.api_runs r ON g.api_call_group_id = r.api_call_group_id""") # 530,232
# todo: siggie says this same as inner join? how/why? does that matter?
# answer: a left join includes all rows of the left table regardless of whether they
# match the right table. this leaves all columns from the right table null.
# but if you require any column from the right table to be not null, then
# it's the same as an inner join. -- whether it matters here, I don't know.
data: List[RowMapping] = sql_query(con, """
SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time,
date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
Expand All @@ -750,75 +741,7 @@ def usage_query(verbose=True) -> List[Dict]:
LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
WHERE g.api_call_group_id != -1 AND g.api_call_group_id IS NOT NULL;
-- WHERE g.api_call_group_id = -1 or g.api_call_group_id IS NULL;
""") # 13,210

# todo: temp: cleanup at end of paper
# --- good?
# 'filter api_call_group = -1' - multiple variations
# - changing 'IS NULL' to 'IS NOT NULL' turned the query from a few seconds into 70 seconds before I canceled
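# - as I suspected (the AI was wrong), there *are* NULL records in here (604). Subtracting that gives the exact
# number as the two queries below this one. Makes sense that it matches my "AND IS NOT NULL" one, but IDK why
# I'm getting the same number from the query when I leave NULL out of it.
# (Likely explanation: in SQL, `NULL != -1` evaluates to NULL, which WHERE treats as false, so the `!= -1`
# filter alone already drops the NULL rows.)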
# data: List[RowMapping] = sql_query(con, """
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
# FROM public.api_runs r
# LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
# WHERE g.api_call_group_id != -1 OR g.api_call_group_id IS NULL;""") # 13,814
# - not adding NULL clause filter. somehow I got less records
# data: List[RowMapping] = sql_query(con, """
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
# FROM public.api_runs r
# LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
# WHERE g.api_call_group_id != -1;""") # 13,210
# data: List[RowMapping] = sql_query(con, """
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
# FROM public.api_runs r
# LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
# WHERE g.api_call_group_id != -1 AND g.api_call_group_id IS NOT NULL;""") # 13,210

# --- haven't tried
# inner join + prefilter distinct call groups
# data: List[RowMapping] = sql_query(con, """
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
# FROM (
# SELECT *
# FROM public.apiruns_grouped
# WHERE api_call_group_id IN (SELECT DISTINCT api_call_group_id FROM public.apiruns_grouped)
# ) AS g
# INNER JOIN public.api_runs r ON g.api_call_group_id = r.api_call_group_id""") # 529,628

# --- Failed
# data: List[RowMapping] = sql_query(con, """
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
# FROM public.apiruns_grouped g
# INNER JOIN public.api_runs r ON g.api_call_group_id = r.api_call_group_id""") # 529,628
# flip and left join
# data: List[RowMapping] = sql_query(con, """
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
# FROM public.api_runs r
# LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id""") # 530,232
# prefilter distinct call groups
# data: List[RowMapping] = sql_query(con, """
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
# FROM (
# SELECT * FROM public.apiruns_grouped
# WHERE api_call_group_id IN (SELECT DISTINCT api_call_group_id FROM public.apiruns_grouped)
# ) as g
# RIGHT JOIN public.api_runs r ON g.api_call_group_id = r.api_call_group_id""") # 530,232
# w/ LIMIT on inner query
# - also got 530,232 records without the array_sort()
# data: List[RowMapping] = sql_query(con, """
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
# FROM (
# SELECT * FROM public.apiruns_grouped LIMIT 10
# ) as g
# RIGHT JOIN public.api_runs r ON g.api_call_group_id = r.api_call_group_id""") # 28k
# no JOIN
# data: List[RowMapping] = sql_query(con, """
# SELECT DISTINCT array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
# FROM public.apiruns_grouped g""") # 3288
print()

""")
data: List[Dict] = [dict(x) for x in data]
if verbose:
print(f'usage_query(): Fetched {len(data)} records in n seconds: {(datetime.now() - t0).seconds}')
Expand Down
Loading

0 comments on commit 59e9dad

Please sign in to comment.