- Changed analysis tables a little more -- now we have apiruns_plus,
  which includes the dev data, and apijoin, which joins in the group
  data and excludes dev data
- db.py:usage_query now just does select * from apijoin
- stats.py:api_runs_query now just does select * from apiruns_plus
- changed stats.py to just use those two tables
Sigfried committed Mar 18, 2024
1 parent 59e9dad commit a7bca44
Showing 3 changed files with 60 additions and 72 deletions.
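As a quick orientation before the diffs, here is a minimal sketch of how the reworked tables are meant to be consumed (table roles paraphrased from the commit message, not from the schema itself):

-- apiruns_plus   : api_runs plus derived per-call columns (week, date, group_id); still includes the dev data
-- apiruns_grouped: one row per group_id, with the sorted api_calls array and session duration
-- apijoin        : apiruns_plus joined to apiruns_grouped, with the dev data excluded

SELECT * FROM public.apijoin;       -- what db.py:usage_query now runs
SELECT * FROM public.apiruns_plus;  -- what stats.py:api_runs_query now runs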
54 changes: 20 additions & 34 deletions backend/db/ddl-18-apirun_groups.jinja.sql
@@ -28,22 +28,22 @@ WITH RankedGroups AS (
SELECT
*,
substring(client from '\d+\.\d+\.\d+') AS ip3, -- sometimes fourth ip part differs for same call session
ROW_NUMBER() OVER (PARTITION BY api_call_group_id ORDER BY timestamp::timestamp DESC) AS rn,
ROW_NUMBER() OVER (
PARTITION BY host, substring(client from '\d+\.\d+\.\d+'), api_call_group_id
ORDER BY timestamp::timestamp DESC) AS rn,
EXTRACT(SECOND FROM timestamp::timestamp - (
lag(timestamp::timestamp) OVER (PARTITION by host, substring(client from '\d+\.\d+\.\d+'), api_call_group_id ORDER BY timestamp) +
lag(process_seconds) OVER (PARTITION by host, substring(client from '\d+\.\d+\.\d+'), api_call_group_id ORDER BY timestamp)
* INTERVAL '1 second')) AS call_gap,
date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
timestamp::date date,
CASE
WHEN timestamp::timestamp - LAG(timestamp::timestamp) OVER
(PARTITION BY host, substring(client from '\d+\.\d+\.\d+'), api_call_group_id ORDER BY timestamp::timestamp) > INTERVAL '1 second' THEN 1
ELSE 0
END AS new_group_flag

FROM public.api_runs
WHERE host IN ('prod', 'dev')
AND client NOT LIKE '216.164.48.98%'
AND client NOT LIKE '174.99.54.40%'
AND client NOT LIKE '136.226%' -- this one can differ by the third and fourth ip parts; just skipping it.
),
groups_broke_by_long_gap AS (
SELECT *,
@@ -52,7 +52,10 @@ groups_broke_by_long_gap AS (
ORDER BY timestamp::timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS group_id
FROM RankedGroups
)
SELECT * INTO public.apiruns_plus FROM groups_broke_by_long_gap;
SELECT *,
ROW_NUMBER() OVER(PARTITION BY group_id ORDER BY timestamp) AS rownum,
COUNT(*) OVER(PARTITION BY group_id) AS grouprows
INTO public.apiruns_plus FROM groups_broke_by_long_gap;

CREATE INDEX aprpidx ON apiruns_plus (group_id );
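The session grouping in this hunk is a gap-and-islands construction: new_group_flag marks a call whose gap since the previous call in the same host/ip3/api_call_group_id partition exceeds one second, and a running SUM of the flags then yields group_id. A minimal, self-contained sketch of the same pattern, using a hypothetical calls(client, ts) table rather than the project schema:

WITH flagged AS (
  SELECT client, ts,
         -- 1 when more than a second has passed since this client's previous call
         CASE WHEN ts - LAG(ts) OVER (PARTITION BY client ORDER BY ts) > INTERVAL '1 second'
              THEN 1 ELSE 0 END AS new_group_flag
  FROM calls
)
SELECT client, ts,
       -- running count of session breaks = session (group) id per client
       SUM(new_group_flag) OVER (PARTITION BY client ORDER BY ts
                                 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS group_id
FROM flagged;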

@@ -64,46 +67,29 @@ SELECT
codeset_ids,
params,
*/
ARRAY_AGG(api_call) AS api_calls,
ARRAY_SORT(ARRAY_AGG(api_call)) AS api_calls,
MIN(timestamp::timestamp) as group_start_time,
MAX(timestamp::timestamp) as group_end_time,
(MAX(timestamp::timestamp) - MIN(timestamp::timestamp)) +
CASE WHEN MAX(rn) = 1 THEN MAX(process_seconds) * INTERVAL '1 second' ELSE INTERVAL '0 seconds' END as duration_seconds
CASE WHEN MAX(rn) = 1 THEN MAX(process_seconds) * INTERVAL '1 second'
ELSE INTERVAL '0 seconds' END as duration_seconds
INTO public.apiruns_grouped
FROM
public.apiruns_plus
GROUP BY
group_id /* , api_call, codeset_ids, params*/
FROM public.apiruns_plus
GROUP BY group_id /* , api_call, codeset_ids, params*/
ORDER BY group_id desc; -- group_start_time DESC, ip3

CREATE INDEX aprgidx ON apiruns_grouped(group_id);

DROP TABLE IF EXISTS public.apijoin CASCADE;

SELECT DISTINCT r.*,
array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time,
date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
timestamp::date date,
FORMAT('%s of %s',
ROW_NUMBER() OVER(PARTITION BY r.group_id ORDER BY timestamp),
COUNT(*) OVER(PARTITION BY r.group_id)) AS callnum
SELECT DISTINCT r.*, g.api_calls, g.duration_seconds, g.group_start_time
INTO public.apijoin
FROM public.apiruns_plus r
LEFT JOIN public.apiruns_grouped g ON g.group_id = r.group_id
WHERE r.host IN ('prod', 'dev')
AND r.client NOT LIKE '216.164.48.98%'
AND r.client NOT LIKE '174.99.54.40%'
AND r.client NOT LIKE '136.226%' -- this one can differ by the third and fourth ip parts; just skipping it.
;

CREATE INDEX aprjidx ON public.apijoin(group_id);

/*
WITH RankedRows AS (
SELECT host,client,api_call,result,week,codeset_ids, params, callnum, api_call_group_id,
ROW_NUMBER() OVER(PARTITION BY column_to_partition_by ORDER BY column_to_order_by) AS rn
FROM public.apijoin
WHERE api_call = 'concept-ids-by-codeset-id'
ORDER BY 9,8
)
SELECT *
FROM RankedRows
WHERE rn = 1;
--WHERE r.api_call_group_id = -1 OR g.api_call_group_id IS NULL
*/
CREATE INDEX aprjidx ON public.apijoin(group_id);
18 changes: 9 additions & 9 deletions backend/routes/db.py
@@ -733,15 +733,15 @@ def usage_query(verbose=True) -> List[Dict]:
Filters out problematic api_call_group_id where the call group is ambiguous (-1 or NULL)"""
t0 = datetime.now()
with get_db_connection() as con:
data: List[RowMapping] = sql_query(con, """
SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time,
date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
timestamp::date date
FROM public.api_runs r
LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
WHERE g.api_call_group_id != -1 AND g.api_call_group_id IS NOT NULL;
-- WHERE g.api_call_group_id = -1 or g.api_call_group_id IS NULL;
""")
data: List[RowMapping] = sql_query(con, """SELECT * FROM public.apijoin""")
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time,
# date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
# timestamp::date date
# FROM public.api_runs r
# LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
# WHERE g.api_call_group_id != -1 AND g.api_call_group_id IS NOT NULL;
# -- WHERE g.api_call_group_id = -1 or g.api_call_group_id IS NULL;
# """)
data: List[Dict] = [dict(x) for x in data]
if verbose:
print(f'usage_query(): Fetched {len(data)} records in n seconds: {(datetime.now() - t0).seconds}')
60 changes: 31 additions & 29 deletions misc/amia_paper_2024/stats.py
@@ -24,7 +24,7 @@
└─────────────────┴────────────┘
- IDK what this one is
SELECT host,client,result, replace(result, ' rows', '') concept_ids, week,api_call_group_id
SELECT host,client,result, replace(result, ' rows', '') concept_ids, week,group_id
FROM public.apijoin
WHERE api_call = 'codeset-ids-by-concept-id'
"""
@@ -67,7 +67,8 @@ def setup():
pd.options.mode.chained_assignment = None # default='warn'

if not INDIR.exists():
OUTDIR.mkdir()
# OUTDIR.mkdir() # typo?
INDIR.mkdir()
if not OUTDIR.exists():
OUTDIR.mkdir()

@@ -83,10 +84,10 @@ def api_runs_query(verbose=False):
t0 = datetime.now()
with get_db_connection() as con:
data: List[RowMapping] = sql_query(
con, """
SELECT DISTINCT *,
date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
timestamp::date date FROM public.api_runs r""")
con, """SELECT * FROM public.apiruns_plus""")
# SELECT DISTINCT *,
# date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
# timestamp::date date FROM public.api_runs r""")
data: List[Dict] = [dict(d) for d in data]
if verbose:
print(f'api_runs_query(): Fetched {len(data)} records in n seconds: {(datetime.now() - t0).seconds}')
@@ -230,13 +231,13 @@ def get_dataset_with_mods(func: Callable, path: Union[str, Path], use_cache=Fals
df['duration_seconds_float'] = df['duration_seconds'].apply(lambda x: x.total_seconds())
# - session duration / concept count
df['cnt'] = df.apply(get_concept_counts, axis=1)
df['cnt_tot'] = df.groupby('api_call_group_id')['cnt'].transform('sum')
df['cnt_tot'] = df.groupby('group_id')['cnt'].transform('sum')
df['duration_sec_per_concept'] = df['duration_seconds_float'] / df['cnt_tot']
# duration_sec_per_1k_concepts: didn't prove useful. visually the same anyway
# df['duration_sec_per_1k_concepts'] = df['duration_seconds_float'] / (df['cnt_tot'] / 1000)

# Filtering
df = preprocess_null_call_groups(df, verbose)
df = preprocess_null_call_groups(df, verbose) # shouldn't do anything
# Formatting
df = df.fillna('') # fixes problem w/ .str.contains() ops
return df
@@ -267,11 +268,11 @@ def filter_dev_data(df: pd.DataFrame, verbose=True) -> pd.DataFrame:
df2['codeset_ids_str'] = df2['codeset_ids'].astype(str)
for case in test_cases:
df_i = df2[(df2['api_call'] == 'get-csets') & (df2['codeset_ids_str'] == case)]
sessions.update(set(df_i['api_call_group_id']))
sessions.update(set(df_i['group_id']))
# -1 is an erroneous api_call_group_id linked to otherwise contextually valid records
if float(-1) in sessions:
sessions.remove(float(-1))
df2 = df2[~df2['api_call_group_id'].isin(sessions)]
df2 = df2[~df2['group_id'].isin(sessions)]
if verbose:
print(f'Filtered out n records created by test cases: ', len_before - len(df2))
return df2
@@ -282,9 +283,9 @@ def preprocess_null_call_groups(df: pd.DataFrame, verbose=True) -> pd.DataFrame:
These are cases from before we added this feature, or where we called the backend directly.
As of 2024/03/14 this doesn't have an effect. No NULL groups."""
df2 = df[~df['api_call_group_id'].isna()]
df2 = df[~df['group_id'].isna()]
if verbose:
print(f'Filtered out n records based on null api_call_group_id: ', len(df) - len(df2))
print(f'Filtered out n records based on null group_id: ', len(df) - len(df2))
return df2


@@ -302,10 +303,10 @@ def summary_stats(
summary = {}
summary['Total log records'] = len(df_apiruns)
summary['Log records with session id'] = len(df_w_groups_filtered)
summary['Log sessions'] = len(df_w_groups_filtered['api_call_group_id'].unique())
summary['Log sessions'] = len(df_w_groups_filtered['group_id'].unique())
summary['IP addresses'] = len(df_apiruns['client_ip'].unique())
summary['Sessions with errors'] = df_w_groups_filtered[df_w_groups_filtered['result'].str.lower().str.contains('error')][
'api_call_group_id'].nunique()
'group_id'].nunique()
summary['All API call errors'] = len(df_apiruns[df_apiruns['result'].str.lower().str.contains('error')])

# Value set combos
@@ -316,14 +317,14 @@
# - analyze uniqueness of get-csets calls within an API call group session
# diff_get_csets_vs_groups=35; i feel like this should be equal to some calc involving next 4 vars, but not sure
# noinspection PyUnusedLocal
diff_get_csets_vs_groups = len(df_get_csets) - len(df_get_csets['api_call_group_id'].unique()) # 35
diff_get_csets_vs_groups = len(df_get_csets) - len(df_get_csets['group_id'].unique()) # 35
diff_codeset_ids_in_group__n_instances = 0 # 1
diff_codeset_ids_in_group__n_calls = 0 # 2
mult_get_cset_in_session__n_instances = 0 # 30
mult_get_cset_in_session__n_calls = 0 # 64
# todo: maybe convert to str in pre-processing instead, if need 'str' more than temporarily
df_get_csets['codeset_ids'] = df_get_csets['codeset_ids'].astype(str)
for group_id, group_data in df_get_csets.groupby('api_call_group_id'):
for group_id, group_data in df_get_csets.groupby('group_id'):
if len(group_data) > 1 and len(group_data['codeset_ids'].unique()) > 1:
diff_codeset_ids_in_group__n_instances += 1
diff_codeset_ids_in_group__n_calls += len(group_data)
@@ -422,7 +423,7 @@ def plots(df: pd.DataFrame, df_dev0: pd.DataFrame, small=False, dev_data_plots=F
# Title
title = f'Duration, session API calls'
# Select data
df_i2 = df_i.drop_duplicates(subset='api_call_group_id', keep='first')
df_i2 = df_i.drop_duplicates(subset='group_id', keep='first')
data: List[float] = df_i2['duration_seconds_float'].tolist()
# noinspection PyTypeChecker
data = [x for x in data if x] # filter null ''s
@@ -440,7 +441,7 @@ def plots(df: pd.DataFrame, df_dev0: pd.DataFrame, small=False, dev_data_plots=F
# Title
title = f'Session API call duration / concept count'
# Select data
df_i2 = df_i.drop_duplicates(subset='api_call_group_id', keep='first') # df_dev0: len 1550 --> 542
df_i2 = df_i.drop_duplicates(subset='group_id', keep='first') # df_dev0: len 1550 --> 542
df_i2 = df_i2[~df_i2['duration_sec_per_concept'].isin([np.inf, -np.inf, ''])] # idk how '' snuck in
data: List[float] = df_i2['duration_sec_per_concept'].tolist()
# noinspection PyTypeChecker
@@ -474,7 +475,7 @@

# Histogram: User queries - sessions
df_i = df_dev0
df_i2 = df_i.drop_duplicates(subset='api_call_group_id', keep='first') # df_dev0: len 1550 --> 542
df_i2 = df_i.drop_duplicates(subset='group_id', keep='first') # df_dev0: len 1550 --> 542
# Select data
# noinspection DuplicatedCode
ip_counts = {}
@@ -512,28 +513,29 @@ def run(use_cache=False, verbose=False, dev_data_plots=False):
# Initial setup ---
t0 = datetime.now()
setup()
df_apiruns: pd.DataFrame = get_dataset_with_mods(api_runs_query, USAGE_UNJOINED_CSV_PATH, use_cache, verbose)
df_w_groups_filtered: pd.DataFrame = get_dataset_with_mods(usage_query, USAGE_JOINED_CSV_PATH, use_cache, verbose)
df_apiruns_dev0: pd.DataFrame = filter_dev_data(df_apiruns, verbose)
df_w_groups_filtered_dev0: pd.DataFrame = filter_dev_data(df_w_groups_filtered, verbose)
df_apiruns_dev1: pd.DataFrame = get_dataset_with_mods(api_runs_query, USAGE_UNJOINED_CSV_PATH, use_cache, verbose)
df_dev0: pd.DataFrame = get_dataset_with_mods(usage_query, USAGE_JOINED_CSV_PATH, use_cache, verbose)
# df_apiruns_dev0: pd.DataFrame = filter_dev_data(df_apiruns_dev1, verbose)
# df_w_groups_filtered_dev0: pd.DataFrame = filter_dev_data(df_w_groups_filtered, verbose)

df_out: pd.DataFrame = summary_stats(df_apiruns_dev1, df_dev0)
# Table ---
# Stats: With dev IPs included
df_out_dev1: pd.DataFrame = summary_stats(df_apiruns, df_w_groups_filtered)
# df_out_dev1: pd.DataFrame = summary_stats(df_apiruns_dev1, df_w_groups_filtered)

# Stats: With dev IPs filtered out
df_out_dev0: pd.DataFrame = summary_stats(df_apiruns_dev0, df_w_groups_filtered_dev0)
# df_out_dev0: pd.DataFrame = summary_stats(df_apiruns_dev0, df_w_groups_filtered_dev0)

# Join different output datasets
df_out = df_out_dev1.merge(df_out_dev0.rename(columns={'Value': 'Value_no_dev'}), on='Measure', how='outer')
# df_out = df_out_dev1.merge(df_out_dev0.rename(columns={'Value': 'Value_no_dev'}), on='Measure', how='outer')
df_out.to_csv(OUT_CSV_PATH_ALL, index=False)

# Plots ---
# - From primary datasets
# todo: would be better to combine dev0/dev1 and small(T/F) here and then make 1 call to plot() for each combo
# - would need refactor 'for df, name_suffix in' out of plot()
plots(df_w_groups_filtered, df_w_groups_filtered_dev0, False, dev_data_plots) # Big
plots(df_w_groups_filtered, df_w_groups_filtered_dev0, True, dev_data_plots) # Small
plots(df_apiruns_dev1, df_dev0, False, dev_data_plots) # Big
plots(df_apiruns_dev1, df_dev0, True, dev_data_plots) # Small
if verbose:
print(f'Finished stats report in n seconds: {(datetime.now() - t0).seconds}')

@@ -563,7 +565,7 @@ def run(use_cache=False, verbose=False, dev_data_plots=False):
└─────────────────┴────────────┘
SELECT host,client,result, replace(result, ' rows', '') concept_ids, week,api_call_group_id
SELECT host,client,result, replace(result, ' rows', '') concept_ids, week,group_id
FROM public.apijoin
WHERE api_call = 'codeset-ids-by-concept-id'
