- Changed analysis tables a little more -- now we have apiruns_plus,
  which includes the dev data, and apijoin, which joins in the group
  data and excludes dev data
- db.py:usage_query now just does select * from apijoin
- stats.py:api_runs_query now just does select * from apiruns_plus
- changed stats.py to just use those two tables
Sigfried committed Mar 18, 2024
1 parent 59e9dad commit a7bca44
Showing 3 changed files with 60 additions and 72 deletions.
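As a quick orientation before the diffs, here is a minimal sketch of how the reworked tables are meant to be consumed (table roles paraphrased from the commit message, not from the schema itself):

-- apiruns_plus   : api_runs plus derived per-call columns (week, date, group_id); still includes the dev data
-- apiruns_grouped: one row per group_id, with the sorted api_calls array and session duration
-- apijoin        : apiruns_plus joined to apiruns_grouped, with the dev data excluded

SELECT * FROM public.apijoin;       -- what db.py:usage_query now runs
SELECT * FROM public.apiruns_plus;  -- what stats.py:api_runs_query now runs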
54 changes: 20 additions & 34 deletions backend/db/ddl-18-apirun_groups.jinja.sql
@@ -28,22 +28,22 @@ WITH RankedGroups AS (
SELECT
*,
substring(client from '\d+\.\d+\.\d+') AS ip3, -- sometimes fourth ip part differs for same call session
ROW_NUMBER() OVER (PARTITION BY api_call_group_id ORDER BY timestamp::timestamp DESC) AS rn,
ROW_NUMBER() OVER (
PARTITION BY host, substring(client from '\d+\.\d+\.\d+'), api_call_group_id
ORDER BY timestamp::timestamp DESC) AS rn,
EXTRACT(SECOND FROM timestamp::timestamp - (
lag(timestamp::timestamp) OVER (PARTITION by host, substring(client from '\d+\.\d+\.\d+'), api_call_group_id ORDER BY timestamp) +
lag(process_seconds) OVER (PARTITION by host, substring(client from '\d+\.\d+\.\d+'), api_call_group_id ORDER BY timestamp)
* INTERVAL '1 second')) AS call_gap,
date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
timestamp::date date,
CASE
WHEN timestamp::timestamp - LAG(timestamp::timestamp) OVER
(PARTITION BY host, substring(client from '\d+\.\d+\.\d+'), api_call_group_id ORDER BY timestamp::timestamp) > INTERVAL '1 second' THEN 1
ELSE 0
END AS new_group_flag

FROM public.api_runs
WHERE host IN ('prod', 'dev')
AND client NOT LIKE '216.164.48.98%'
AND client NOT LIKE '174.99.54.40%'
AND client NOT LIKE '136.226%' -- this one can differ by the third and fourth ip parts; just skipping it.
),
groups_broke_by_long_gap AS (
SELECT *,
@@ -52,7 +52,10 @@ groups_broke_by_long_gap AS (
ORDER BY timestamp::timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS group_id
FROM RankedGroups
)
SELECT * INTO public.apiruns_plus FROM groups_broke_by_long_gap;
SELECT *,
ROW_NUMBER() OVER(PARTITION BY group_id ORDER BY timestamp) AS rownum,
COUNT(*) OVER(PARTITION BY group_id) AS grouprows
INTO public.apiruns_plus FROM groups_broke_by_long_gap;

CREATE INDEX aprpidx ON apiruns_plus (group_id );
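The session grouping in this hunk is a gap-and-islands construction: new_group_flag marks a call whose gap since the previous call in the same host/ip3/api_call_group_id partition exceeds one second, and a running SUM of the flags then yields group_id. A minimal, self-contained sketch of the same pattern, using a hypothetical calls(client, ts) table rather than the project schema:

WITH flagged AS (
  SELECT client, ts,
         -- 1 when more than a second has passed since this client's previous call
         CASE WHEN ts - LAG(ts) OVER (PARTITION BY client ORDER BY ts) > INTERVAL '1 second'
              THEN 1 ELSE 0 END AS new_group_flag
  FROM calls
)
SELECT client, ts,
       -- running count of session breaks = session (group) id per client
       SUM(new_group_flag) OVER (PARTITION BY client ORDER BY ts
                                 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS group_id
FROM flagged;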

@@ -64,46 +67,29 @@ SELECT
codeset_ids,
params,
*/
ARRAY_AGG(api_call) AS api_calls,
ARRAY_SORT(ARRAY_AGG(api_call)) AS api_calls,
MIN(timestamp::timestamp) as group_start_time,
MAX(timestamp::timestamp) as group_end_time,
(MAX(timestamp::timestamp) - MIN(timestamp::timestamp)) +
CASE WHEN MAX(rn) = 1 THEN MAX(process_seconds) * INTERVAL '1 second' ELSE INTERVAL '0 seconds' END as duration_seconds
CASE WHEN MAX(rn) = 1 THEN MAX(process_seconds) * INTERVAL '1 second'
ELSE INTERVAL '0 seconds' END as duration_seconds
INTO public.apiruns_grouped
FROM
public.apiruns_plus
GROUP BY
group_id /* , api_call, codeset_ids, params*/
FROM public.apiruns_plus
GROUP BY group_id /* , api_call, codeset_ids, params*/
ORDER BY group_id desc; -- group_start_time DESC, ip3

CREATE INDEX aprgidx ON apiruns_grouped(group_id);

DROP TABLE IF EXISTS public.apijoin CASCADE;

SELECT DISTINCT r.*,
array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time,
date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
timestamp::date date,
FORMAT('%s of %s',
ROW_NUMBER() OVER(PARTITION BY r.group_id ORDER BY timestamp),
COUNT(*) OVER(PARTITION BY r.group_id)) AS callnum
SELECT DISTINCT r.*, g.api_calls, g.duration_seconds, g.group_start_time
INTO public.apijoin
FROM public.apiruns_plus r
LEFT JOIN public.apiruns_grouped g ON g.group_id = r.group_id
WHERE r.host IN ('prod', 'dev')
AND r.client NOT LIKE '216.164.48.98%'
AND r.client NOT LIKE '174.99.54.40%'
AND r.client NOT LIKE '136.226%' -- this one can differ by the third and fourth ip parts; just skipping it.
;

CREATE INDEX aprjidx ON public.apijoin(group_id);

/*
WITH RankedRows AS (
SELECT host,client,api_call,result,week,codeset_ids, params, callnum, api_call_group_id,
ROW_NUMBER() OVER(PARTITION BY column_to_partition_by ORDER BY column_to_order_by) AS rn
FROM public.apijoin
WHERE api_call = 'concept-ids-by-codeset-id'
ORDER BY 9,8
)
SELECT *
FROM RankedRows
WHERE rn = 1;
--WHERE r.api_call_group_id = -1 OR g.api_call_group_id IS NULL
*/
CREATE INDEX aprjidx ON public.apijoin(group_id);
18 changes: 9 additions & 9 deletions backend/routes/db.py
@@ -733,15 +733,15 @@ def usage_query(verbose=True) -> List[Dict]:
Filters out problematic api_call_group_id where the call group is ambiguous (-1 or NULL)"""
t0 = datetime.now()
with get_db_connection() as con:
data: List[RowMapping] = sql_query(con, """
SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time,
date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
timestamp::date date
FROM public.api_runs r
LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
WHERE g.api_call_group_id != -1 AND g.api_call_group_id IS NOT NULL;
-- WHERE g.api_call_group_id = -1 or g.api_call_group_id IS NULL;
""")
data: List[RowMapping] = sql_query(con, """SELECT * FROM public.apijoin""")
# SELECT DISTINCT r.*, array_sort(g.api_calls) api_calls, g.duration_seconds, g.group_start_time,
# date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
# timestamp::date date
# FROM public.api_runs r
# LEFT JOIN public.apiruns_grouped g ON g.api_call_group_id = r.api_call_group_id
# WHERE g.api_call_group_id != -1 AND g.api_call_group_id IS NOT NULL;
# -- WHERE g.api_call_group_id = -1 or g.api_call_group_id IS NULL;
# """)
data: List[Dict] = [dict(x) for x in data]
if verbose:
print(f'usage_query(): Fetched {len(data)} records in n seconds: {(datetime.now() - t0).seconds}')
60 changes: 31 additions & 29 deletions misc/amia_paper_2024/stats.py
@@ -24,7 +24,7 @@
└─────────────────┴────────────┘
- IDK what this one is
SELECT host,client,result, replace(result, ' rows', '') concept_ids, week,api_call_group_id
SELECT host,client,result, replace(result, ' rows', '') concept_ids, week,group_id
FROM public.apijoin
WHERE api_call = 'codeset-ids-by-concept-id'
"""
@@ -67,7 +67,8 @@ def setup():
pd.options.mode.chained_assignment = None # default='warn'

if not INDIR.exists():
OUTDIR.mkdir()
# OUTDIR.mkdir() # typo?
INDIR.mkdir()
if not OUTDIR.exists():
OUTDIR.mkdir()

@@ -83,10 +84,10 @@ def api_runs_query(verbose=False):
t0 = datetime.now()
with get_db_connection() as con:
data: List[RowMapping] = sql_query(
con, """
SELECT DISTINCT *,
date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
timestamp::date date FROM public.api_runs r""")
con, """SELECT * FROM public.apiruns_plus""")
# SELECT DISTINCT *,
# date_bin('1 week', timestamp::TIMESTAMP, TIMESTAMP '2023-10-30')::date week,
# timestamp::date date FROM public.api_runs r""")
data: List[Dict] = [dict(d) for d in data]
if verbose:
print(f'api_runs_query(): Fetched {len(data)} records in n seconds: {(datetime.now() - t0).seconds}')
@@ -230,13 +231,13 @@ def get_dataset_with_mods(func: Callable, path: Union[str, Path], use_cache=Fals
df['duration_seconds_float'] = df['duration_seconds'].apply(lambda x: x.total_seconds())
# - session duration / concept count
df['cnt'] = df.apply(get_concept_counts, axis=1)
df['cnt_tot'] = df.groupby('api_call_group_id')['cnt'].transform('sum')
df['cnt_tot'] = df.groupby('group_id')['cnt'].transform('sum')
df['duration_sec_per_concept'] = df['duration_seconds_float'] / df['cnt_tot']
# duration_sec_per_1k_concepts: didn't prove useful. visually the same anyway
# df['duration_sec_per_1k_concepts'] = df['duration_seconds_float'] / (df['cnt_tot'] / 1000)

# Filtering
df = preprocess_null_call_groups(df, verbose)
df = preprocess_null_call_groups(df, verbose) # shouldn't do anything
# Formatting
df = df.fillna('') # fixes problem w/ .str.contains() ops
return df
@@ -267,11 +268,11 @@ def filter_dev_data(df: pd.DataFrame, verbose=True) -> pd.DataFrame:
df2['codeset_ids_str'] = df2['codeset_ids'].astype(str)
for case in test_cases:
df_i = df2[(df2['api_call'] == 'get-csets') & (df2['codeset_ids_str'] == case)]
sessions.update(set(df_i['api_call_group_id']))
sessions.update(set(df_i['group_id']))
# -1 is an erroneous api_call_group_id linked to otherwise contextually valid records
if float(-1) in sessions:
sessions.remove(float(-1))
df2 = df2[~df2['api_call_group_id'].isin(sessions)]
df2 = df2[~df2['group_id'].isin(sessions)]
if verbose:
print(f'Filtered out n records created by test cases: ', len_before - len(df2))
return df2
@@ -282,9 +283,9 @@ def preprocess_null_call_groups(df: pd.DataFrame, verbose=True) -> pd.DataFrame:
These are cases from before we added this feature, or where we called the backend directly.
As of 2024/03/14 this doesn't have an effect. No NULL groups."""
df2 = df[~df['api_call_group_id'].isna()]
df2 = df[~df['group_id'].isna()]
if verbose:
print(f'Filtered out n records based on null api_call_group_id: ', len(df) - len(df2))
print(f'Filtered out n records based on null group_id: ', len(df) - len(df2))
return df2


@@ -302,10 +303,10 @@ def summary_stats(
summary = {}
summary['Total log records'] = len(df_apiruns)
summary['Log records with session id'] = len(df_w_groups_filtered)
summary['Log sessions'] = len(df_w_groups_filtered['api_call_group_id'].unique())
summary['Log sessions'] = len(df_w_groups_filtered['group_id'].unique())
summary['IP addresses'] = len(df_apiruns['client_ip'].unique())
summary['Sessions with errors'] = df_w_groups_filtered[df_w_groups_filtered['result'].str.lower().str.contains('error')][
'api_call_group_id'].nunique()
'group_id'].nunique()
summary['All API call errors'] = len(df_apiruns[df_apiruns['result'].str.lower().str.contains('error')])

# Value set combos
@@ -316,14 +317,14 @@
# - analyze uniqueness of get-csets calls within an API call group session
# diff_get_csets_vs_groups=35; i feel like this should be equal to some calc involving next 4 vars, but not sure
# noinspection PyUnusedLocal
diff_get_csets_vs_groups = len(df_get_csets) - len(df_get_csets['api_call_group_id'].unique()) # 35
diff_get_csets_vs_groups = len(df_get_csets) - len(df_get_csets['group_id'].unique()) # 35
diff_codeset_ids_in_group__n_instances = 0 # 1
diff_codeset_ids_in_group__n_calls = 0 # 2
mult_get_cset_in_session__n_instances = 0 # 30
mult_get_cset_in_session__n_calls = 0 # 64
# todo: maybe convert to str in pre-processing instead, if need 'str' more than temporarily
df_get_csets['codeset_ids'] = df_get_csets['codeset_ids'].astype(str)
for group_id, group_data in df_get_csets.groupby('api_call_group_id'):
for group_id, group_data in df_get_csets.groupby('group_id'):
if len(group_data) > 1 and len(group_data['codeset_ids'].unique()) > 1:
diff_codeset_ids_in_group__n_instances += 1
diff_codeset_ids_in_group__n_calls += len(group_data)
@@ -422,7 +423,7 @@ def plots(df: pd.DataFrame, df_dev0: pd.DataFrame, small=False, dev_data_plots=F
# Title
title = f'Duration, session API calls'
# Select data
df_i2 = df_i.drop_duplicates(subset='api_call_group_id', keep='first')
df_i2 = df_i.drop_duplicates(subset='group_id', keep='first')
data: List[float] = df_i2['duration_seconds_float'].tolist()
# noinspection PyTypeChecker
data = [x for x in data if x] # filter null ''s
@@ -440,7 +441,7 @@ def plots(df: pd.DataFrame, df_dev0: pd.DataFrame, small=False, dev_data_plots=F
# Title
title = f'Session API call duration / concept count'
# Select data
df_i2 = df_i.drop_duplicates(subset='api_call_group_id', keep='first') # df_dev0: len 1550 --> 542
df_i2 = df_i.drop_duplicates(subset='group_id', keep='first') # df_dev0: len 1550 --> 542
df_i2 = df_i2[~df_i2['duration_sec_per_concept'].isin([np.inf, -np.inf, ''])] # idk how '' snuck in
data: List[float] = df_i2['duration_sec_per_concept'].tolist()
# noinspection PyTypeChecker
@@ -474,7 +475,7 @@

# Histogram: User queries - sessions
df_i = df_dev0
df_i2 = df_i.drop_duplicates(subset='api_call_group_id', keep='first') # df_dev0: len 1550 --> 542
df_i2 = df_i.drop_duplicates(subset='group_id', keep='first') # df_dev0: len 1550 --> 542
# Select data
# noinspection DuplicatedCode
ip_counts = {}
@@ -512,28 +513,29 @@ def run(use_cache=False, verbose=False, dev_data_plots=False):
# Initial setup ---
t0 = datetime.now()
setup()
df_apiruns: pd.DataFrame = get_dataset_with_mods(api_runs_query, USAGE_UNJOINED_CSV_PATH, use_cache, verbose)
df_w_groups_filtered: pd.DataFrame = get_dataset_with_mods(usage_query, USAGE_JOINED_CSV_PATH, use_cache, verbose)
df_apiruns_dev0: pd.DataFrame = filter_dev_data(df_apiruns, verbose)
df_w_groups_filtered_dev0: pd.DataFrame = filter_dev_data(df_w_groups_filtered, verbose)
df_apiruns_dev1: pd.DataFrame = get_dataset_with_mods(api_runs_query, USAGE_UNJOINED_CSV_PATH, use_cache, verbose)
df_dev0: pd.DataFrame = get_dataset_with_mods(usage_query, USAGE_JOINED_CSV_PATH, use_cache, verbose)
# df_apiruns_dev0: pd.DataFrame = filter_dev_data(df_apiruns_dev1, verbose)
# df_w_groups_filtered_dev0: pd.DataFrame = filter_dev_data(df_w_groups_filtered, verbose)

df_out: pd.DataFrame = summary_stats(df_apiruns_dev1, df_dev0)
# Table ---
# Stats: With dev IPs included
df_out_dev1: pd.DataFrame = summary_stats(df_apiruns, df_w_groups_filtered)
# df_out_dev1: pd.DataFrame = summary_stats(df_apiruns_dev1, df_w_groups_filtered)

# Stats: With dev IPs filtered out
df_out_dev0: pd.DataFrame = summary_stats(df_apiruns_dev0, df_w_groups_filtered_dev0)
# df_out_dev0: pd.DataFrame = summary_stats(df_apiruns_dev0, df_w_groups_filtered_dev0)

# Join different output datasets
df_out = df_out_dev1.merge(df_out_dev0.rename(columns={'Value': 'Value_no_dev'}), on='Measure', how='outer')
# df_out = df_out_dev1.merge(df_out_dev0.rename(columns={'Value': 'Value_no_dev'}), on='Measure', how='outer')
df_out.to_csv(OUT_CSV_PATH_ALL, index=False)

# Plots ---
# - From primary datasets
# todo: would be better to combine dev0/dev1 and small(T/F) here and then make 1 call to plot() for each combo
# - would need refactor 'for df, name_suffix in' out of plot()
plots(df_w_groups_filtered, df_w_groups_filtered_dev0, False, dev_data_plots) # Big
plots(df_w_groups_filtered, df_w_groups_filtered_dev0, True, dev_data_plots) # Small
plots(df_apiruns_dev1, df_dev0, False, dev_data_plots) # Big
plots(df_apiruns_dev1, df_dev0, True, dev_data_plots) # Small
if verbose:
print(f'Finished stats report in n seconds: {(datetime.now() - t0).seconds}')

@@ -563,7 +565,7 @@ def run(use_cache=False, verbose=False, dev_data_plots=False):
└─────────────────┴────────────┘
SELECT host,client,result, replace(result, ' rows', '') concept_ids, week,api_call_group_id
SELECT host,client,result, replace(result, ' rows', '') concept_ids, week,group_id
FROM public.apijoin
WHERE api_call = 'codeset-ids-by-concept-id'
