Feature: Query authors and respondents #18
- Add: New function implementing basic feature: create_report_users()

Misc
- Add: Comment linking to the user roles Google Sheet.
- Update: Renamed 'report1' and 'report2' variables and functions to be more descriptive.
- Update: Reorganized run().
- Update: Fixed an incorrect return type annotation.
- Update: .gitignore: Added *.pickle
joeflack4 committed Aug 15, 2022
1 parent b22e3ea commit 7ee1748
Showing 2 changed files with 135 additions and 14 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@ __pycache__/
_archive/
_dev/
.idea/
*.pickle
.env
build/
dev/
148 changes: 134 additions & 14 deletions fhir_zulip_nlp/fhir_zulip_nlp.py
@@ -9,6 +9,8 @@
3. The Zulip chat we're querying: https://chat.fhir.org/#
4. Category keywords google sheet:
https://docs.google.com/spreadsheets/d/1OB0CEAkOhVTN71uIhzCo_iNaiD1B6qLqL7uwil5O22Q/edit#gid=1136391153
5. User roles google sheet:
https://docs.google.com/spreadsheets/d/1OB0CEAkOhVTN71uIhzCo_iNaiD1B6qLqL7uwil5O22Q/edit#gid=1504038457
Possible areas of improvement
1. Save to calling directory, not project directory.
@@ -19,6 +21,7 @@
"""
import math
import os
import pickle
import sys
import time
from argparse import ArgumentParser
@@ -46,8 +49,10 @@
'zuliprc_path': os.path.join(ENV_DIR, '.zuliprc'), # rc = "runtime config"
'chat_stream_name': 'terminology',
'num_messages_per_query': 1000,
'outpath_report1': os.path.join(PROJECT_DIR, 'zulip_report1_counts.csv'),
'outpath_report2': os.path.join(PROJECT_DIR, 'zulip_report2_thread_lengths.csv'),
'outpath_report_counts': os.path.join(PROJECT_DIR, 'zulip_report1_counts.csv'),
'outpath_report_thread_length': os.path.join(PROJECT_DIR, 'zulip_report2_thread_lengths.csv'),
'outpath_report_users': os.path.join(PROJECT_DIR, 'zulip_report3_users.csv'),
'outpath_report_roles': os.path.join(PROJECT_DIR, 'zulip_report4_user_roles.csv'), # todo
'outpath_errors': os.path.join(PROJECT_DIR, 'zulip_errors.csv'),
'outpath_no_results': os.path.join(PROJECT_DIR, 'zulip_report_queries_with_no_results.csv'),
'outpath_raw_results': os.path.join(PROJECT_DIR, RAW_RESULTS_FILENAME),
@@ -223,10 +228,10 @@ def _get_counts_from_kw_messages(
return kw_report


def create_report1(
def create_report_counts(
df: pd.DataFrame, category_keywords: TYPE_KEYWORDS_DICT, kw_contexts: Dict[str, List[str]]
) -> (pd.DataFrame, pd.DataFrame):
"""Report 1: counts and latest/oldest message timestamps"""
"""Report: counts and latest/oldest message timestamps"""
reports: List[Dict] = []
no_results: List[Dict] = []
today = str(date.today())
@@ -252,18 +257,18 @@ def create_report1(
df_no_results = pd.DataFrame(no_results)

# Save & return
df_report.to_csv(CONFIG['outpath_report1'], index=False)
df_report.to_csv(CONFIG['outpath_report_counts'], index=False)
if len(df_no_results) > 0:
df_no_results.to_csv(CONFIG['outpath_no_results'], index=False)

return df_report, df_no_results


def create_report2(
def create_report_thread_length(
df: pd.DataFrame, category_keywords: TYPE_KEYWORDS_DICT, kw_contexts: Dict[str, List[str]],
include_all_columns=False
) -> (pd.DataFrame, pd.DataFrame):
"""Report 2: thread lengths
) -> pd.DataFrame:
"""Report: thread lengths
include_all_columns: Pending bugfix. If False, excludes the affected columns from the report.
todo: fix category / all-messages counts and stddev. Right now the counts are mostly 0; they are not being calculated
from the correct message selections. Not sure if there are stddev calc issues; those could just be a consequence of the counts.
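
As a minimal sketch of the fix this todo describes, and assuming the `subject` and `timestamp` message columns used elsewhere in this diff, per-thread counts and their stddev can be derived from a single groupby rather than a separate message selection; `thread_length_stats` is a hypothetical helper, not part of this commit:

    import pandas as pd

    def thread_length_stats(df: pd.DataFrame) -> (float, float):
        """Mean and stddev of messages per thread; df has one row per message."""
        thread_counts = df.groupby('subject')['timestamp'].count()
        return float(thread_counts.mean()), float(thread_counts.std())
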
@@ -369,7 +374,120 @@ def create_report2(
df_report = pd.DataFrame(reports)
df_report = format_df(df_report)
# Save & return
df_report.to_csv(CONFIG['outpath_report2'], index=False)
df_report.to_csv(CONFIG['outpath_report_thread_length'], index=False)
return df_report


def create_report_users(df: pd.DataFrame) -> pd.DataFrame:
"""Report: Users
# todo: Pending completion of 'streams' issue, change output `stream_id` -> `stream` / `stream_name`
# todo: Does it make sense to refactor this to start with users first, then drill down?"""

# TODO: Refactor to collect all of the users['user_data'] first. Then the following block below can be used for
# ...thread_participation.
pass

# stream_id_name_map = {'179202': 'terminology'}
#
# users = {}
# streams = df['stream_id'].unique()
# for stream_id in streams:
# stream_name = stream_id_name_map[str(stream_id)]
# df_i = df[df['stream_id'] == stream_id]
# # todo: Would they like the 0 totals as well? If so, rather than .unique(), should use `category_keywords`
# categories = df_i['category'].unique()
# for c in categories:
# df_i2 = df_i[df_i['category'] == c]
# keywords = df_i2['keyword'].unique()
# for k in keywords:
# df_i3 = df_i2[df_i2['keyword'] == k]
# threads = df_i3['subject'].unique()
# for thread in threads:
# df_i4 = df_i3[df_i3['subject'] == thread]
# # Get authorship vs non-authorship
# # todo: This could probably be done in fewer lines: author and respondent section is redundant
# author_timestamp = min(df_i4['timestamp'])
# author_row: Dict = df_i4[df_i4['timestamp'] == author_timestamp].to_dict()
# author_row = {k: list(v.values())[0] for k, v in author_row.items()}
# author_id = author_row['sender_id']
# participants = {
# author_id: {
# 'role': 'author',
# 'user_id': author_id,
# 'full_name': author_row['sender_full_name'],
# 'email': author_row['sender_email']
# }
# }
# respondent_id_set = set(list(df_i4['sender_id'].unique()))
# respondent_id_set.remove(author_id)
# for resp_id in list(respondent_id_set):
# resp_row: Dict = df_i4[df_i4['sender_id'] == resp_id].to_dict()
# resp_row = {k: list(v.values())[0] for k, v in resp_row.items()}
# participants[resp_id] = {
# 'role': 'respondent',
# 'user_id': resp_row['sender_id'],
# 'full_name': resp_row['sender_full_name'],
# 'email': resp_row['sender_email']
# }
#
# # Collect all information and put in users
# for user_id, user_data in participants.items():
# if user_id not in users:
# users[user_id] = {
# 'user_data': {
# 'user_id': user_data['user_id'],
# 'full_name': user_data['full_name'],
# 'email': user_data['email']
# },
# 'thread_participation': {
# 'category': {}
# }
# }
# # todo: I really don't like the way I've done this. All the `noinspection` is indicative of that
# if c not in users[user_id]['thread_participation']['category']:
# users[user_id]['thread_participation']['category'][c] = {'keyword': {}}
# # noinspection PyTypeChecker
# if k not in users[user_id]['thread_participation']['category'][c]['keyword']:
# # noinspection PyTypeChecker,PyUnresolvedReferences
# users[user_id]['thread_participation']['category'][c]['keyword'][k] = \
# {'thread_roles': {}}
# # noinspection PyTypeChecker,PyUnresolvedReferences
# users[user_id]['thread_participation']['category'][c]['keyword'][k]['thread_roles'][thread] = \
# user_data['role']
#
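# A sketch of one way to avoid the per-level `if ... not in ...` guards and
# `noinspection` pragmas flagged in the todo above (an illustration only, not
# part of this commit): key a flat dict by tuple and reshape at report time.
#   thread_roles: Dict[tuple, str] = {}  # (user_id, category, keyword, thread) -> role
#   for user_id, user_data in participants.items():
#       thread_roles[(user_id, c, k, thread)] = user_data['role']
# Alternatively, collections.defaultdict builds the nested form without the guards.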

# # TODO: undo temp pickle & uncomment above
# with open('users.pickle', 'wb') as handle:
# pickle.dump(users, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('users.pickle', 'rb') as handle:
users = pickle.load(handle)
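# A cache-or-compute sketch for the temp pickle above (an illustration, not part
# of this commit): only read the pickle when it exists, else rebuild and write
# it, so the temporary cache is self-healing.
#   if os.path.exists('users.pickle'):
#       with open('users.pickle', 'rb') as handle:
#           users = pickle.load(handle)
#   else:
#       users = build_users(df)  # hypothetical helper wrapping the commented block above
#       with open('users.pickle', 'wb') as handle:
#           pickle.dump(users, handle, protocol=pickle.HIGHEST_PROTOCOL)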

# TODO: Tell which role each user has: author or respondent; the author can be identified by the earliest timestamp?
# TODO: divide into threads, get counts, and then re-aggregate
for user in users:
user_id = '' # todo
user_email = '' # todo
user_fullname = '' # todo
author_count = 0 # todo
respondent_count = 0 # todo
participant_count = author_count + respondent_count

# TODO: How to combine aggregates and non-aggregates? type=aggregate|item, type_name=?
# TODO: add category and keyword
# noinspection PyUnboundLocalVariable
result = {
'stream': stream_name, 'user_id': user_id, 'user_email': user_email,
'user_fullname': user_fullname,
'author_count': author_count, 'respondent_count': respondent_count,
'participant_count': participant_count}

# TODO: Summarize total counts for: category, stream, all
pass

# Save & return
results: List[Dict] = []
df_report = pd.DataFrame(results)
df_report.to_csv(CONFIG['outpath_report_users'], index=False)
return df_report
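
The commented-out block above derives each thread's author as the sender of its earliest message, with every other sender a respondent. Below is a minimal runnable sketch of that derivation and the per-user counts the todos call for, assuming only the `subject`, `timestamp`, `sender_id`, `sender_full_name`, and `sender_email` columns that appear in this diff; `user_role_counts` is a hypothetical helper, and the stream/category/keyword rollups are left out:

    from typing import Dict

    import pandas as pd

    def user_role_counts(df: pd.DataFrame) -> pd.DataFrame:
        """One row per user with author_count, respondent_count, participant_count."""
        counts: Dict[int, Dict] = {}
        for _, df_thread in df.groupby('subject'):
            # Author = sender of the earliest message in the thread.
            author_id = df_thread.loc[df_thread['timestamp'].idxmin(), 'sender_id']
            for sender_id, df_sender in df_thread.groupby('sender_id'):
                row = counts.setdefault(sender_id, {
                    'user_id': sender_id,
                    'user_fullname': df_sender['sender_full_name'].iloc[0],
                    'user_email': df_sender['sender_email'].iloc[0],
                    'author_count': 0, 'respondent_count': 0})
                role_key = 'author_count' if sender_id == author_id else 'respondent_count'
                row[role_key] += 1
        df_report = pd.DataFrame(list(counts.values()))
        df_report['participant_count'] = (
            df_report['author_count'] + df_report['respondent_count'])
        return df_report

Each user is counted once per thread per role, mirroring the per-thread `participants` dict in the commented block.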


@@ -461,13 +579,15 @@ def _get_keyword_contexts(use_cached_keyword_inputs=False) -> Dict[str, List[str

def run(analyze_only=False, use_cached_keyword_inputs=False):
"""Run program"""
# Get inputs
keywords: TYPE_KEYWORDS_DICT = _get_keywords(use_cached_keyword_inputs)
# kw_contexts: Dict[str, List[str]] = _get_keyword_contexts()
# Get messages
message_df: pd.DataFrame = query_categories(keywords) if not analyze_only else _load_cached_messages()
kw_contexts: Dict[str, List[str]] = _get_keyword_contexts()
# - report 1: counts and latest/oldest message timestamps && keywords w/ no results
create_report1(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
# - report 2: thread lengths
create_report2(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
# Create reports
# create_report_counts(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
# create_report_thread_length(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
create_report_users(message_df)


def cli():
