Feature: Query authors and respondents #18
- Add: New function implementing basic feature: create_report_users()

Misc
- Add: Comment linking to the user roles Google Sheet.
- Update: Renamed 'report1' and 'report2' variables and functions to be more descriptive.
- Update: Reorganized run().
- Update: Fixed an incorrect return type annotation.
- Update: .gitignore: Added *.pickle
joeflack4 committed Aug 15, 2022
1 parent b22e3ea commit 7ee1748
Showing 2 changed files with 135 additions and 14 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@ __pycache__/
_archive/
_dev/
.idea/
*.pickle
.env
build/
dev/
148 changes: 134 additions & 14 deletions fhir_zulip_nlp/fhir_zulip_nlp.py
@@ -9,6 +9,8 @@
3. The Zulip chat we're querying: https://chat.fhir.org/#
4. Category keywords google sheet:
https://docs.google.com/spreadsheets/d/1OB0CEAkOhVTN71uIhzCo_iNaiD1B6qLqL7uwil5O22Q/edit#gid=1136391153
5. User roles google sheet:
https://docs.google.com/spreadsheets/d/1OB0CEAkOhVTN71uIhzCo_iNaiD1B6qLqL7uwil5O22Q/edit#gid=1504038457
Possible areas of improvement
1. Save to calling directory, not project directory.
@@ -19,6 +21,7 @@
"""
import math
import os
import pickle
import sys
import time
from argparse import ArgumentParser
@@ -46,8 +49,10 @@
'zuliprc_path': os.path.join(ENV_DIR, '.zuliprc'), # rc = "runtime config"
'chat_stream_name': 'terminology',
'num_messages_per_query': 1000,
'outpath_report1': os.path.join(PROJECT_DIR, 'zulip_report1_counts.csv'),
'outpath_report2': os.path.join(PROJECT_DIR, 'zulip_report2_thread_lengths.csv'),
'outpath_report_counts': os.path.join(PROJECT_DIR, 'zulip_report1_counts.csv'),
'outpath_report_thread_length': os.path.join(PROJECT_DIR, 'zulip_report2_thread_lengths.csv'),
'outpath_report_users': os.path.join(PROJECT_DIR, 'zulip_report3_users.csv'),
'outpath_report_roles': os.path.join(PROJECT_DIR, 'zulip_report4_user_roles.csv'), # todo
'outpath_errors': os.path.join(PROJECT_DIR, 'zulip_errors.csv'),
'outpath_no_results': os.path.join(PROJECT_DIR, 'zulip_report_queries_with_no_results.csv'),
'outpath_raw_results': os.path.join(PROJECT_DIR, RAW_RESULTS_FILENAME),
@@ -223,10 +228,10 @@ def _get_counts_from_kw_messages(
return kw_report


def create_report1(
def create_report_counts(
df: pd.DataFrame, category_keywords: TYPE_KEYWORDS_DICT, kw_contexts: Dict[str, List[str]]
) -> (pd.DataFrame, pd.DataFrame):
"""Report 1: counts and latest/oldest message timestamps"""
"""Report: counts and latest/oldest message timestamps"""
reports: List[Dict] = []
no_results: List[Dict] = []
today = str(date.today())
@@ -252,18 +257,18 @@ def create_report1(
df_no_results = pd.DataFrame(no_results)

# Save & return
df_report.to_csv(CONFIG['outpath_report1'], index=False)
df_report.to_csv(CONFIG['outpath_report_counts'], index=False)
if len(df_no_results) > 0:
df_no_results.to_csv(CONFIG['outpath_no_results'], index=False)

return df_report, df_no_results


def create_report2(
def create_report_thread_length(
df: pd.DataFrame, category_keywords: TYPE_KEYWORDS_DICT, kw_contexts: Dict[str, List[str]],
include_all_columns=False
) -> (pd.DataFrame, pd.DataFrame):
"""Report 2: thread lengths
) -> pd.DataFrame:
"""Report: thread lengths
include_all_columns: Pending bugfix. If False, excludes the affected columns from the report.
todo: fix category / all-messages counts and stddev. Right now the counts are mostly 0; they are not being calculated
from the correct message selections. Not sure if there are stddev calc issues; those could just be a consequence of the counts.
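
As a minimal sketch of the fix this todo describes, and assuming the `subject` and `timestamp` message columns used elsewhere in this diff, per-thread counts and their stddev can be derived from a single groupby rather than a separate message selection; `thread_length_stats` is a hypothetical helper, not part of this commit:

    import pandas as pd

    def thread_length_stats(df: pd.DataFrame) -> (float, float):
        """Mean and stddev of messages per thread; df has one row per message."""
        thread_counts = df.groupby('subject')['timestamp'].count()
        return float(thread_counts.mean()), float(thread_counts.std())
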
@@ -369,7 +374,120 @@ def create_report2(
df_report = pd.DataFrame(reports)
df_report = format_df(df_report)
# Save & return
df_report.to_csv(CONFIG['outpath_report2'], index=False)
df_report.to_csv(CONFIG['outpath_report_thread_length'], index=False)
return df_report


def create_report_users(df: pd.DataFrame) -> pd.DataFrame:
"""Report: Users
# todo: Pending completion of 'streams' issue, change output `stream_id` -> `stream` / `stream_name`
# todo: Does it make sense to refactor this to start with users first, then drill down?"""

# TODO: Refactor to collect all of the users['user_data'] first. Then the following block below can be used for
# ...thread_participation.
pass

# stream_id_name_map = {'179202': 'terminology'}
#
# users = {}
# streams = df['stream_id'].unique()
# for stream_id in streams:
# stream_name = stream_id_name_map[str(stream_id)]
# df_i = df[df['stream_id'] == stream_id]
# # todo: Would they like the 0 totals as well? If so, rather than .unique(), should use `category_keywords`
# categories = df_i['category'].unique()
# for c in categories:
# df_i2 = df_i[df_i['category'] == c]
# keywords = df_i2['keyword'].unique()
# for k in keywords:
# df_i3 = df_i2[df_i2['keyword'] == k]
# threads = df_i3['subject'].unique()
# for thread in threads:
# df_i4 = df_i3[df_i3['subject'] == thread]
# # Get authorship vs non-authorship
# # todo: This could probably be done in fewer lines: author and respondent section is redundant
# author_timestamp = min(df_i4['timestamp'])
# author_row: Dict = df_i4[df_i4['timestamp'] == author_timestamp].to_dict()
# author_row = {k: list(v.values())[0] for k, v in author_row.items()}
# author_id = author_row['sender_id']
# participants = {
# author_id: {
# 'role': 'author',
# 'user_id': author_id,
# 'full_name': author_row['sender_full_name'],
# 'email': author_row['sender_email']
# }
# }
# respondent_id_set = set(list(df_i4['sender_id'].unique()))
# respondent_id_set.remove(author_id)
# for resp_id in list(respondent_id_set):
# resp_row: Dict = df_i4[df_i4['sender_id'] == resp_id].to_dict()
# resp_row = {k: list(v.values())[0] for k, v in resp_row.items()}
# participants[resp_id] = {
# 'role': 'respondent',
# 'user_id': resp_row['sender_id'],
# 'full_name': resp_row['sender_full_name'],
# 'email': resp_row['sender_email']
# }
#
# # Collect all information and put in users
# for user_id, user_data in participants.items():
# if user_id not in users:
# users[user_id] = {
# 'user_data': {
# 'user_id': user_data['user_id'],
# 'full_name': user_data['full_name'],
# 'email': user_data['email']
# },
# 'thread_participation': {
# 'category': {}
# }
# }
# # todo: I really don't like the way I've done this. All the `noinspection` is indicative of that
# if c not in users[user_id]['thread_participation']['category']:
# users[user_id]['thread_participation']['category'][c] = {'keyword': {}}
# # noinspection PyTypeChecker
# if k not in users[user_id]['thread_participation']['category'][c]['keyword']:
# # noinspection PyTypeChecker,PyUnresolvedReferences
# users[user_id]['thread_participation']['category'][c]['keyword'][k] = \
# {'thread_roles': {}}
# # noinspection PyTypeChecker,PyUnresolvedReferences
# users[user_id]['thread_participation']['category'][c]['keyword'][k]['thread_roles'][thread] = \
# user_data['role']
#
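# A sketch of one way to avoid the per-level `if ... not in ...` guards and
# `noinspection` pragmas flagged in the todo above (an illustration only, not
# part of this commit): key a flat dict by tuple and reshape at report time.
#   thread_roles: Dict[tuple, str] = {}  # (user_id, category, keyword, thread) -> role
#   for user_id, user_data in participants.items():
#       thread_roles[(user_id, c, k, thread)] = user_data['role']
# Alternatively, collections.defaultdict builds the nested form without the guards.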

# # TODO: undo temp pickle & uncomment above
# with open('users.pickle', 'wb') as handle:
# pickle.dump(users, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('users.pickle', 'rb') as handle:
users = pickle.load(handle)
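# A cache-or-compute sketch for the temp pickle above (an illustration, not part
# of this commit): only read the pickle when it exists, else rebuild and write
# it, so the temporary cache is self-healing.
#   if os.path.exists('users.pickle'):
#       with open('users.pickle', 'rb') as handle:
#           users = pickle.load(handle)
#   else:
#       users = build_users(df)  # hypothetical helper wrapping the commented block above
#       with open('users.pickle', 'wb') as handle:
#           pickle.dump(users, handle, protocol=pickle.HIGHEST_PROTOCOL)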

# TODO: Tell which role each user has: author or respondent; the author can be identified by the earliest timestamp?
# TODO: divide into threads, get counts, and then re-aggregate
for user in users:
user_id = '' # todo
user_email = '' # todo
user_fullname = '' # todo
author_count = 0 # todo
respondent_count = 0 # todo
participant_count = author_count + respondent_count

# TODO: How to combine aggregates and non-aggregates? type=aggregate|item, type_name=?
# TODO: add category and keyword
# noinspection PyUnboundLocalVariable
result = {
'stream': stream_name, 'user_id': user_id, 'user_email': user_email,
'user_fullname': user_fullname,
'author_count': author_count, 'respondent_count': respondent_count,
'participant_count': participant_count}

# TODO: Summarize total counts for: category, stream, all
pass

# Save & return
results: List[Dict] = []
df_report = pd.DataFrame(results)
df_report.to_csv(CONFIG['outpath_report_users'], index=False)
return df_report
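
The commented-out block above derives each thread's author as the sender of its earliest message, with every other sender a respondent. Below is a minimal runnable sketch of that derivation and the per-user counts the todos call for, assuming only the `subject`, `timestamp`, `sender_id`, `sender_full_name`, and `sender_email` columns that appear in this diff; `user_role_counts` is a hypothetical helper, and the stream/category/keyword rollups are left out:

    from typing import Dict

    import pandas as pd

    def user_role_counts(df: pd.DataFrame) -> pd.DataFrame:
        """One row per user with author_count, respondent_count, participant_count."""
        counts: Dict[int, Dict] = {}
        for _, df_thread in df.groupby('subject'):
            # Author = sender of the earliest message in the thread.
            author_id = df_thread.loc[df_thread['timestamp'].idxmin(), 'sender_id']
            for sender_id, df_sender in df_thread.groupby('sender_id'):
                row = counts.setdefault(sender_id, {
                    'user_id': sender_id,
                    'user_fullname': df_sender['sender_full_name'].iloc[0],
                    'user_email': df_sender['sender_email'].iloc[0],
                    'author_count': 0, 'respondent_count': 0})
                role_key = 'author_count' if sender_id == author_id else 'respondent_count'
                row[role_key] += 1
        df_report = pd.DataFrame(list(counts.values()))
        df_report['participant_count'] = (
            df_report['author_count'] + df_report['respondent_count'])
        return df_report

Each user is counted once per thread per role, mirroring the per-thread `participants` dict in the commented block.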


@@ -461,13 +579,15 @@ def _get_keyword_contexts(use_cached_keyword_inputs=False) -> Dict[str, List[str

def run(analyze_only=False, use_cached_keyword_inputs=False):
"""Run program"""
# Get inputs
keywords: TYPE_KEYWORDS_DICT = _get_keywords(use_cached_keyword_inputs)
# kw_contexts: Dict[str, List[str]] = _get_keyword_contexts()
# Get messages
message_df: pd.DataFrame = query_categories(keywords) if not analyze_only else _load_cached_messages()
kw_contexts: Dict[str, List[str]] = _get_keyword_contexts()
# - report 1: counts and latest/oldest message timestamps && keywords w/ no results
create_report1(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
# - report 2: thread lengths
create_report2(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
# Create reports
# create_report_counts(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
# create_report_thread_length(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
create_report_users(message_df)


def cli():
