diff --git a/backend/routes/db.py b/backend/routes/db.py
index fc5714a22..4c50342b2 100644
--- a/backend/routes/db.py
+++ b/backend/routes/db.py
@@ -726,22 +726,13 @@ def next_api_call_group_id() -> int:
     return id


+# todo: can / should we replace this query with selecting from `apijoin` table instead?
 def usage_query(verbose=True) -> List[Dict]:
     """Query for usage data

     Filters out problematic api_call_group_id where the call group is ambiguous (-1 or NULL)"""
     t0 = datetime.now()
     with get_db_connection() as con:
-        # what we had originally
-        # data: List[RowMapping] = sql_query(con, """
-        #     SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
-        #     FROM public.apiruns_grouped g
-        #     RIGHT JOIN public.api_runs r ON g.api_call_group_id = r.api_call_group_id""")  # 530,232
-        # todo: siggie says this is the same as an inner join? how/why? does that matter?
-        #  answer: a left join includes all rows of the left table regardless of whether they
-        #  match the right table; for unmatched rows, the right table's columns are all null.
-        #  but if you require any column from the right table to be not null, then
-        #  it's the same as an inner join. -- whether it matters here, I don't know. (see sketch after the diff)
         data: List[RowMapping] = sql_query(con, """
             SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time,
               date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
@@ -750,75 +741,7 @@ def usage_query(verbose=True) -> List[Dict]:
             LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
             WHERE g.api_call_group_id != -1 AND g.api_call_group_id IS NOT NULL;
             -- WHERE g.api_call_group_id = -1 or g.api_call_group_id IS NULL;
-            """)  # 13,210
-
-        # todo: temp: cleanup at end of paper
-        # --- good?
-        # 'filter api_call_group = -1' - multiple variations
-        # - changing 'IS NULL' to 'IS NOT NULL' turned the query from a few seconds into 70 seconds before I canceled
-        # - as I suspected (the AI was wrong), there *are* NULL records in here (604). Subtracting that gives the exact
-        #   same number as the two queries below this one. Makes sense that it matches my "AND IS NOT NULL" one, but
-        #   IDK why I'm getting the same number from the query when I leave NULL out of it.
-        # data: List[RowMapping] = sql_query(con, """
-        #     SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
-        #     FROM public.api_runs r
-        #     LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
-        #     WHERE g.api_call_group_id != -1 OR g.api_call_group_id IS NULL;""")  # 13,814
-        # - not adding NULL clause filter. Somehow I got fewer records:
-        # data: List[RowMapping] = sql_query(con, """
-        #     SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
-        #     FROM public.api_runs r
-        #     LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
-        #     WHERE g.api_call_group_id != -1;""")  # 13,210
-        # data: List[RowMapping] = sql_query(con, """
-        #     SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
-        #     FROM public.api_runs r
-        #     LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
-        #     WHERE g.api_call_group_id != -1 AND g.api_call_group_id IS NOT NULL;""")  # 13,210
-
-        # --- haven't tried
-        # inner join + prefilter distinct call groups
-        # data: List[RowMapping] = sql_query(con, """
-        #     SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
-        #     FROM (
-        #         SELECT *
-        #         FROM public.apiruns_grouped
-        #         WHERE api_call_group_id IN (SELECT DISTINCT api_call_group_id FROM public.apiruns_grouped)
-        #     ) AS g
-        #     INNER JOIN public.api_runs r ON g.api_call_group_id = r.api_call_group_id""")  # 529,628
-
-        # --- Failed
-        # data: List[RowMapping] = sql_query(con, """
-        #     SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
-        #     FROM public.apiruns_grouped g
-        #     INNER JOIN public.api_runs r ON g.api_call_group_id = r.api_call_group_id""")  # 529,628
-        # flip and left join
-        # data: List[RowMapping] = sql_query(con, """
-        #     SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
-        #     FROM public.api_runs r
-        #     LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id""")  # 530,232
-        # prefilter distinct call groups
-        # data: List[RowMapping] = sql_query(con, """
-        #     SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
-        #     FROM (
-        #         SELECT * FROM public.apiruns_grouped
-        #         WHERE api_call_group_id IN (SELECT DISTINCT api_call_group_id FROM public.apiruns_grouped)
-        #     ) as g
-        #     RIGHT JOIN public.api_runs r ON g.api_call_group_id = r.api_call_group_id""")  # 530,232
-        # w/ LIMIT on inner query
-        # - also got 530,232 records without the array_sort()
-        # data: List[RowMapping] = sql_query(con, """
-        #     SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
-        #     FROM (
-        #         SELECT * FROM public.apiruns_grouped LIMIT 10
-        #     ) as g
-        #     RIGHT JOIN public.api_runs r ON g.api_call_group_id = r.api_call_group_id""")  # 28k
-        # no JOIN
-        # data: List[RowMapping] = sql_query(con, """
-        #     SELECT DISTINCT array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time
-        #     FROM public.apiruns_grouped g""")  # 3288
-        print()
-
+            """)
     data: List[Dict] = [dict(x) for x in data]
     if verbose:
         print(f'usage_query(): Fetched {len(data)} records in n seconds: {(datetime.now() - t0).seconds}')
diff --git a/misc/amia_paper_2024/stats.py b/misc/amia_paper_2024/stats.py
index 824ab8599..9c3849e8d 100644
--- a/misc/amia_paper_2024/stats.py
+++ b/misc/amia_paper_2024/stats.py
@@ -5,7 +5,29 @@
 Resources
 - GH issue: https://github.com/jhu-bids/TermHub/issues/631
-- Paper: https://livejohnshopkins-my.sharepoint.com/:w:/g/personal/sgold15_jh_edu/EXbuxwBsb0pPgFfgPYjR6qgBlJ1gmpqsMPYJ3vn6Y_HnYw?e=4%3ApEDbMe&fromShare=true&at=9&CID=d4663c40-1ebe-db59-8cf5-2bc091d45692"""
+- Paper: https://livejohnshopkins-my.sharepoint.com/:w:/g/personal/sgold15_jh_edu/EXbuxwBsb0pPgFfgPYjR6qgBlJ1gmpqsMPYJ3vn6Y_HnYw?e=4%3ApEDbMe&fromShare=true&at=9&CID=d4663c40-1ebe-db59-8cf5-2bc091d45692
+
+Other queries:
+- Value set size
+with c as (select count(*) cnt from cset_members_items group by codeset_id)
+select 10^round(log10(cnt)) || ' - ' || 10 * 10^round(log10(cnt)) - 1 value_set_size, count(*) value_sets
+from c group by 1 order by 1 desc;
+-- note: round(log10(cnt)) buckets by nearest power of 10, so e.g. 50 would land in '100 - 999';
+-- floor() would bucket strictly by order of magnitude (see sketch after the diff)
+┌─────────────────┬────────────┐
+│ value_set_size  │ value_sets │
+├─────────────────┼────────────┤
+│ 1 - 9           │        774 │
+│ 10 - 99         │       1862 │
+│ 100 - 999       │       2596 │
+│ 1000 - 9999     │       1357 │
+│ 10000 - 99999   │        537 │
+│ 100000 - 999999 │         64 │
+└─────────────────┴────────────┘
+
+- IDK what this one is for (todo: document)
+SELECT host,client,result, replace(result, ' rows', '') concept_ids, week,api_call_group_id
+FROM public.apijoin
+WHERE api_call = 'codeset-ids-by-concept-id'
+"""
 import os
 import sys
 from datetime import datetime
@@ -13,6 +35,7 @@
 from pathlib import Path
 from typing import Callable, Dict, List, Union

+import numpy as np
 import pandas as pd
 from sqlalchemy import RowMapping

@@ -49,6 +72,12 @@ def setup():
     OUTDIR.mkdir()


+def filename_format(filename: str) -> str:
+    """Format a plot title into a filesystem-safe, snake_case filename"""
+    return filename.lower().replace('(', '').replace(')', '').replace(',', '')\
+        .replace(' / ', ' ').replace('/', '').replace('  ', ' ').replace(' ', '_')
+
+
 def api_runs_query(verbose=False):
     """Get all records from api_runs table"""
     t0 = datetime.now()
@@ -64,6 +93,110 @@ def api_runs_query(verbose=False):
     return data


+def histogram_query_concepts_in_calls(filter_nulls=True) -> pd.DataFrame:
+    """Concepts in calls
+
+    result, modified (NULLs removed, at the very least)
+    ┌─────────────────────┬───────┐
+    │  concepts_in_call   │ calls │
+    ├─────────────────────┼───────┤
+    │ 1 - 9               │    20 │
+    │ 10 - 99             │ 1,642 │
+    │ 100 - 999           │   855 │
+    │ 1000 - 9999         │   505 │
+    │ 10000 - 99999       │   153 │
+    │ 100000 - 999999     │   121 │
+    │ 1000000 - 9999999   │     3 │
+    │ 10000000 - 99999999 │     2 │
+    └─────────────────────┴───────┘
+    """
+    # Query
+    # - note: round(log10(cnt)) buckets by nearest power of 10 (e.g. 50 -> '100 - 999'); floor() would
+    #   bucket strictly by order of magnitude (see sketch after the diff)
+    qry = """
+        WITH c1 AS (
+            SELECT replace(result, ' rows', '') cnt FROM public.apijoin
+            WHERE api_call IN ('codeset-ids-by-concept-id', 'concept', 'get-cset-members-items')
+        ), c2 AS (
+            SELECT CASE WHEN cnt ~ '^[0-9]+$' THEN cnt::integer END AS cnt FROM c1
+        )
+        SELECT 10^round(log10(cnt)) || ' - ' || 10 * 10^round(log10(cnt)) - 1 concepts_in_call, count(*) calls
+        FROM c2 GROUP BY 1 ORDER BY 1 DESC;"""
+    with get_db_connection() as con:
+        data: List[RowMapping] = sql_query(con, qry)
+    data: List[Dict] = [dict(d) for d in data]
+    df = pd.DataFrame(data)
+    # Filter
+    if filter_nulls:
+        df = df[df['concepts_in_call'].notna()]
+    # Format
+    def relabel(val):
+        """Relabel column vals: log10"""
+        min_val = val.split(' - ')[0]
+        n = min_val.count('0')  # min_val is a power of 10 ('1', '10', ...), so its zero count is the exponent
+        return f'10^{n}-10^{n+1}'
+    df['concepts_in_call'] = df['concepts_in_call'].apply(relabel)
+    return df
+
+
+# todo: y axis also in log scale
+def plot_concepts_in_calls(log_scale=True):
+    """Plot concepts in calls"""
+    # Vars
+    df: pd.DataFrame = histogram_query_concepts_in_calls()
+    xdata = df['concepts_in_call'].tolist()
+    ydata = df['calls']
+    title = f'n concepts in API calls{" (log scale)" if log_scale else ""}'
+    xlab = 'Concepts in call'
+    ylab = 'Number of calls'
+    filename = filename_format(title)
+
+    # Big
+    plt.figure(figsize=(10, 6))
+    # plt.bar(xdata, ydata, color='skyblue', log=log_scale)  # same
+    plt.bar(xdata, ydata, log=log_scale)  # same
+    plt.xlabel(xlab)
+    plt.ylabel(ylab)
+    plt.title(title)  # same
+    plt.xticks(rotation=45)
+    plt.tight_layout()
+    plt.savefig(OUTDIR / f'{filename}.png', dpi=500)
+    plt.clf()
+
+    # Small
+    # noinspection DuplicatedCode todo
+    plt.figure(figsize=(3, 1))
+    # plt.bar(xdata, ydata, color='skyblue', log=log_scale)  # same
+    plt.bar(xdata, ydata, log=log_scale)  # same
+    plt.title(title)  # same
+    plt.rcParams["font.size"] = "5"
+    plt.tick_params(axis='y', which='both', left=False, labelleft=False)  # Turn off y-axis ticks and labels
+    plt.xticks(rotation=90)
+    plt.savefig(OUTDIR / f'{filename} - small.png', bbox_inches='tight', dpi=100)
+    plt.clf()
+
+    # Small 2
+    # - same except no rotation and diff xdata
+    xdata = ['<' + x.split('-')[1] for x in xdata]
+    # noinspection DuplicatedCode todo
+    plt.figure(figsize=(3, 1))
+    # plt.bar(xdata, ydata, color='skyblue', log=log_scale)  # same
+    plt.bar(xdata, ydata, log=log_scale)  # same
+    plt.title(title)  # same
+    plt.rcParams["font.size"] = "5"
+    plt.tick_params(axis='y', which='both', left=False, labelleft=False)  # Turn off y-axis ticks and labels
+    # plt.xticks(rotation=90)
+    plt.savefig(OUTDIR / f'{filename} - small2.png', bbox_inches='tight', dpi=100)
+    plt.clf()
+
+
+def get_concept_counts(row: pd.Series) -> int:
+    """Get the concept count parsed from a row's 'result' field"""
+    null_token = 0  # I'd rather return NULL, but for expedience, sticking w/ this for now
+    if row['api_call'] not in ('codeset-ids-by-concept-id', 'concept', 'get-cset-members-items'):
+        return null_token
+    x = row['result']
+    return int(x.split(' ')[0]) if x and x.endswith(' rows') else null_token
+
+
 def get_dataset_with_mods(func: Callable, path: Union[str, Path], use_cache=False, verbose=True) -> pd.DataFrame:
     """Get data to be analyzed: JOIN of api_runs and apiruns_grouped

@@ -89,7 +222,19 @@ def get_dataset_with_mods(func: Callable, path: Union[str, Path], use_cache=Fals
     if verbose:
         print(f'Read {len(df)} records from {path} in n seconds: {(datetime.now() - t0).seconds}')
     # Mods
+    # - client ip
     df['client_ip'] = df['client'].apply(lambda x: x.split(':')[0])
+    # - duration_seconds_float and session duration / concept count
+    if 'duration_seconds' in df.columns:
+        # - duration_seconds_float
+        df['duration_seconds_float'] = df['duration_seconds'].apply(lambda x: x.total_seconds())
+        # - session duration / concept count (see transform('sum') sketch after the diff)
+        df['cnt'] = df.apply(get_concept_counts, axis=1)
+        df['cnt_tot'] = df.groupby('api_call_group_id')['cnt'].transform('sum')
+        df['duration_sec_per_concept'] = df['duration_seconds_float'] / df['cnt_tot']
+        # duration_sec_per_1k_concepts: didn't prove useful; visually the same anyway
+        # df['duration_sec_per_1k_concepts'] = df['duration_seconds_float'] / (df['cnt_tot'] / 1000)
+    # Filtering
     df = preprocess_null_call_groups(df, verbose)
     # Formatting
@@ -192,60 +337,180 @@ def summary_stats(
     return df


-def plot_render(title: str, xlab: str, ylab: str, small=False):
-    """Close out plot"""
-    filename = title.replace(' ', '_').lower()
-    # todo: put in else?
-    plt.xlabel(xlab)
-    plt.ylabel(ylab)
+def plot_render(
+    data: List[Union[float, int]], title: str, xlab: str, ylab: str, bins=None, small=False, is_timeseries=False,
+    log_scale=False, xtick_fix=False, xtick_step=None
+):
+    """Close out plot
+
+    :param xtick_fix: Off by default because it increases overall exec time from 5 to 28 seconds"""
+    title = title + (' (log scale)' if log_scale else '')
+    # Initialize plot
+    if small:
+        plt.figure(figsize=(3, 1))  # fig = plt.figure(figsize=(1, 0.5))
+    if bins:
+        plt.hist(data, bins=bins, edgecolor='black', alpha=0.7, log=log_scale)
+    else:
+        plt.hist(data, edgecolor='black', alpha=0.7, log=log_scale)
+    # Name
+    filename = filename_format(title)
+    # Title
+    # Valid font sizes are xx-small, x-small, small, medium, large, x-large, xx-large, larger, smaller, None
+    # plt.title(title, fontsize='x-small' if small else 'medium')  # medium is the default
+    plt.title(title)
+    plt.grid(True)
+    # X Ticks: center of bars rather than to the left
+    # todo: why are ticks & labels getting super bolded with this fix? (likely because `data` is treated as
+    #  bin edges here, so one overlapping label is drawn per data point)
+    if not is_timeseries and xtick_fix:
+        plt.xticks([edge + 0.5 for edge in data[:-1]], data[:-1])
+    if xtick_step:
+        plt.xticks(range(0, int(max(data)), xtick_step), rotation=45)
+
     if small:
-        plt.title(title, fontsize='smaller')
-        plt.axis('off')
+        # Font size
+        # todo: this isn't working; not a big priority
+        plt.rcParams["font.size"] = "6"  # long titles make the plot smaller
+        # Axis
+        if not is_timeseries:  # it makes sense to get rid of axes for timeseries, but not otherwise
+            plt.tick_params(axis='y', which='both', left=False, labelleft=False)  # Turn off y-axis ticks and labels
+        else:
+            plt.axis('off')
+        # Save
         # plt.savefig(OUTDIR / f'{filename}.png', dpi=100)
         plt.savefig(OUTDIR / f'{filename} - small.png', bbox_inches='tight', dpi=100)
     else:
-        plt.title(title)
-        plt.savefig(OUTDIR / f'{filename}.png')
+        # Labels
+        plt.xlabel(xlab)
+        plt.ylabel(ylab)
+        # Save
+        plt.savefig(OUTDIR / f'{filename}.png', dpi=100)
     plt.clf()


-# todo: histogram sections duplicative. could be refactored to be more DRY
-def plots(df: pd.DataFrame, df_dev0: pd.DataFrame, small=False):
+# todo: could be DRYer (see Counter sketch after the diff)
+# todo: if dev data useful: consider dev + non-dev in same plot
+def plots(df: pd.DataFrame, df_dev0: pd.DataFrame, small=False, dev_data_plots=False):
     """Plots"""
-    # todo: consider dev + non-dev in same plot
     # Histogram: Value set n selections
     # - Size distribution histogram. How many code sets are in the initial call of a log session?
-    for df, name_suffix in ((df, 'Dev data included'), (df_dev0, '')):
+    datasets = ((df, 'Dev data included'), (df_dev0, '')) if dev_data_plots else ((df_dev0, ''),)
+    for df_i, name_suffix in datasets:
         # Title
         title = f'Code set comparison size{f" - {name_suffix}" if name_suffix else ""}'
         # Select data
-        df_i = df[df['api_call'] == 'get-csets']
-        df_i['len_codeset_ids'] = df_i['codeset_ids'].apply(lambda x: len(x))
-        data = list(df_i['len_codeset_ids'])
+        df_i2 = df_i[df_i['api_call'] == 'get-csets']
+        df_i2['len_codeset_ids'] = df_i2['codeset_ids'].apply(len)
+        data: List[int] = list(df_i2['len_codeset_ids'])
+        bins = range(min(data), max(data) + 2)
         # Render
-        if small:
-            plt.figure(figsize=(3, 1))  # fig = plt.figure(figsize=(1, 0.5))
-        plt.hist(data, bins=range(min(data), max(data) + 1), edgecolor='black', alpha=0.7)
-        plot_render(title, 'Number of code sets being compared', 'Frequency', small)

-    for df, name_suffix in ((df, 'Dev data included'), (df_dev0, '')):
+        plot_render(data, title, 'Number of code sets being compared', 'Frequency', bins, small)
+
+    # Histogram: API calls per week
+    for df_i, name_suffix in datasets:
         # Title
         title = f'API calls per week{f" - {name_suffix}" if name_suffix else ""}'
         # Select data
         # for df, name_suffix in ((df_apiruns, 'Dev data included'), (df_apiruns_dev0, '')):
-        data = list(df['week'])
+        data: List = list(df_i['week'])
+        bins = len(set(data))
         # Render
-        if small:
-            plt.figure(figsize=(3, 1))  # fig = plt.figure(figsize=(1, 0.5))
-        plt.hist(data, bins=len(set(data)), edgecolor='black', alpha=0.7)
-        plot_render(title, 'Week', 'API calls', small)
-
-
-def run(use_cache=False, verbose=False):
+        plot_render(data, title, 'Week', 'API calls', bins, small, is_timeseries=True)
+
+    # Histogram: Duration, session API calls
+    # df_i = df  # dev data ok for this one, but for some reason it's not rendering
+    df_i = df_dev0
+    # Title
+    title = 'Duration, session API calls'
+    # Select data
+    df_i2 = df_i.drop_duplicates(subset='api_call_group_id', keep='first')
+    data: List[float] = df_i2['duration_seconds_float'].tolist()
+    # noinspection PyTypeChecker
+    data = [x for x in data if x]  # filter out nulls / empty strings (also drops 0s)
+    # data = [round(x, 1) for x in data]  # lower decimal precision didn't make any diff
+    # - int or deca seconds also didn't make a difference
+    bins = 50
+    # Render
+    plot_render(
+        data, title, 'n seconds', 'n sessions', bins, small, log_scale=True, xtick_step=10)
+
+    # Histogram: Session duration divided by concept count
+    # todo: alternate view with n concepts on the y axis and n seconds on the x
+    # df_i = df  # dev data should've been ok, but there were many outliers
+    df_i = df_dev0
+    # Title
+    title = 'Session API call duration / concept count'
+    # Select data
+    df_i2 = df_i.drop_duplicates(subset='api_call_group_id', keep='first')  # df_dev0: len 1550 --> 542
+    df_i2 = df_i2[~df_i2['duration_sec_per_concept'].isin([np.inf, -np.inf, ''])]  # idk how '' snuck in
+    data: List[float] = df_i2['duration_sec_per_concept'].tolist()
+    # noinspection PyTypeChecker
+    data = [x for x in data if x]  # filter out nulls / empty strings
+    bins = 30
+    # Render
+    plot_render(
+        data, title, 'n seconds / concept', 'instances', bins, small, log_scale=True)
+
+    # Histogram: User queries - calls
+    # todo #1: DRY up w/ other user queries section(s) (see Counter sketch after the diff)
+    # - remove '# noinspection DuplicatedCode' after
+    df_i = df_dev0
+    # Select data
+    # noinspection DuplicatedCode
+    ip_counts = {}
+    ips = df_i['client_ip'].to_list()
+    for ip in ips:
+        ip_counts[ip] = ip_counts.get(ip, 0) + 1
+    data = list(ip_counts.values())
+    # Variation: binning
+    bins = int(max(data) / 3)
+    title = 'N queries made by users'
+    plot_render(
+        data, title, 'n queries', 'n users', bins, small, log_scale=True, xtick_step=10)
+    # Variation: no binning
+    bins = max(data)
+    title = 'N queries made by users - No bins'
+    plot_render(
+        data, title, 'n queries', 'n users', bins, small, log_scale=True, xtick_step=10)
+
+    # Histogram: User queries - sessions
+    df_i = df_dev0
+    df_i2 = df_i.drop_duplicates(subset='api_call_group_id', keep='first')  # df_dev0: len 1550 --> 542
+    # Select data
+    # noinspection DuplicatedCode
+    ip_counts = {}
+    ips = df_i2['client_ip'].to_list()
+    for ip in ips:
+        ip_counts[ip] = ip_counts.get(ip, 0) + 1
+    data = list(ip_counts.values())
+    # Plot
+    bins = max(data)
+    title = 'N user sessions'
+    plot_render(
+        data, title, 'n sessions', 'n users', bins, small, log_scale=True, xtick_step=2)
+
+    # Histogram: User queries - days
+    df_i = df_dev0
+    df_i2 = df_i.drop_duplicates(subset=['client_ip', 'date'], keep='first')
+    # Select data
+    # noinspection DuplicatedCode
+    ip_counts = {}
+    ips = df_i2['client_ip'].to_list()
+    for ip in ips:
+        ip_counts[ip] = ip_counts.get(ip, 0) + 1
+    data = list(ip_counts.values())
+    # Plot
+    bins = max(data)
+    title = 'User active days'
+    plot_render(
+        data, title, 'n distinct days active', 'n users', bins, small, log_scale=True, xtick_step=1)
+
+
+def run(use_cache=False, verbose=False, dev_data_plots=False):
     """Run analysis

-    :param use_cache: If True, will use most recent local CSV instead of calling the database"""
+    :param use_cache: If True, will use most recent local CSV instead of calling the database
+    :param dev_data_plots: If True, also render each plot with dev data included"""
     # Initial setup ---
+    t0 = datetime.now()
     setup()
     df_apiruns: pd.DataFrame = get_dataset_with_mods(api_runs_query, USAGE_UNJOINED_CSV_PATH, use_cache, verbose)
     df_w_groups_filtered: pd.DataFrame = get_dataset_with_mods(usage_query, USAGE_JOINED_CSV_PATH, use_cache, verbose)
@@ -264,10 +529,17 @@ def run(use_cache=False, verbose=False):
     df_out.to_csv(OUT_CSV_PATH_ALL, index=False)

     # Plots ---
+    # - From primary datasets
     # todo: would be better to combine dev0/dev1 and small(T/F) here and then make 1 call to plot() for each combo
     #  - would need to refactor 'for df, name_suffix in' out of plot()
-    plots(df_w_groups_filtered, df_w_groups_filtered_dev0, small=False)
-    plots(df_w_groups_filtered, df_w_groups_filtered_dev0, small=True)
+    plots(df_w_groups_filtered, df_w_groups_filtered_dev0, False, dev_data_plots)  # Big
+    plots(df_w_groups_filtered, df_w_groups_filtered_dev0, True, dev_data_plots)  # Small
+    if verbose:
+        print(f'Finished stats report in {(datetime.now() - t0).seconds} seconds')
+
+    # - From custom queries
+    for log_scale in (True, False):
+        plot_concepts_in_calls(log_scale)


 if __name__ == '__main__':
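
A few sketches follow, referenced from comments in the diff above; none of this code is part of the patch.

On the LEFT vs INNER JOIN comment removed from usage_query(): a LEFT JOIN keeps every left-table row and fills the right table's columns with NULL when there is no match, so a WHERE clause requiring a right-table column to be NOT NULL drops exactly those rows and reproduces the INNER JOIN. A minimal runnable demo using Python's built-in sqlite3, with toy tables (`runs` and `groups` are illustrative stand-ins, not TermHub's schema):

import sqlite3

con = sqlite3.connect(':memory:')
con.executescript("""
    CREATE TABLE runs (id INTEGER, group_id INTEGER);
    CREATE TABLE groups (group_id INTEGER, duration INTEGER);
    INSERT INTO runs VALUES (1, 10), (2, 20), (3, 99);  -- run 3 has no matching group
    INSERT INTO groups VALUES (10, 5), (20, 7);
""")
# LEFT JOIN + "right-table column IS NOT NULL" ...
left_filtered = con.execute("""
    SELECT r.id FROM runs r
    LEFT JOIN groups g ON g.group_id = r.group_id
    WHERE g.group_id IS NOT NULL ORDER BY r.id""").fetchall()
# ... returns the same rows as a plain INNER JOIN
inner = con.execute("""
    SELECT r.id FROM runs r
    INNER JOIN groups g ON g.group_id = r.group_id ORDER BY r.id""").fetchall()
assert left_filtered == inner == [(1,), (2,)]  # run 3 drops out either way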
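On the round() vs floor() notes added to the log10 bucket queries: round(log10(n)) snaps to the nearest power of ten, so any n from about 10^1.5 (~32) up to 99 gets labeled '100 - 999' even though it belongs in '10 - 99'; floor() buckets strictly by order of magnitude. A sketch in Python (`decade_bucket` is a hypothetical helper, not part of stats.py):

import math

def decade_bucket(n: int) -> str:
    """Label n with its order-of-magnitude bucket, e.g. 50 -> '10 - 99'."""
    lo = 10 ** math.floor(math.log10(n))  # floor, not round
    return f'{lo} - {lo * 10 - 1}'

assert decade_bucket(50) == '10 - 99'
assert decade_bucket(1642) == '1000 - 9999'
# round() misfiles anything past the decade's geometric midpoint:
assert 10 ** round(math.log10(50)) == 100  # i.e. 50 would be labeled '100 - 999'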
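On the cnt_tot mod in get_dataset_with_mods(): groupby(...).transform('sum') returns one value per row (that row's group total) rather than one row per group, which is what lets duration_sec_per_concept divide a row-level duration by a session-level concept count. A toy example with made-up values:

import pandas as pd

df = pd.DataFrame({'api_call_group_id': [1, 1, 2], 'cnt': [3, 4, 5]})
# transform('sum') broadcasts each group's total back onto its member rows
df['cnt_tot'] = df.groupby('api_call_group_id')['cnt'].transform('sum')
assert df['cnt_tot'].tolist() == [7, 7, 5]  # vs .sum(), which would yield one row per group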
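On the "DRY up" todos in plots(): the three ip_counts loops are the same reduction and could share one helper built on collections.Counter (`query_counts_per_user` is a hypothetical name, not existing code):

from collections import Counter
from typing import List

import pandas as pd

def query_counts_per_user(df: pd.DataFrame, ip_col: str = 'client_ip') -> List[int]:
    """Count rows per client IP and return just the per-user counts."""
    return list(Counter(df[ip_col].to_list()).values())

# Usage, mirroring the sessions variant:
#   df_i2 = df_dev0.drop_duplicates(subset='api_call_group_id', keep='first')
#   data = query_counts_per_user(df_i2)

Staying in pandas, `df[ip_col].value_counts().tolist()` is an equivalent one-liner.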