Feature: Query authors and respondents #18
- Add: New function implementing basic feature: create_report_users()
- Add: Documentation for feature to README.md

Misc
- Add: Codebook section at bottom of README.md documentation.
- Add: Comment with link to user roles GoogleSheet.
- Update: Renamed 'report1' and 'report2' variable and function names to be more descriptive.
- Update: Reorganized run()
- Update: Fixed an incorrect type annotation.
- Update: .gitignore: Added *.pickle
joeflack4 committed Aug 16, 2022
1 parent b22e3ea commit c4ce0fb
Showing 3 changed files with 175 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@ __pycache__/
 _archive/
 _dev/
 .idea/
+*.pickle
 .env
 build/
 dev/
32 changes: 32 additions & 0 deletions README.md
@@ -21,3 +21,35 @@ pattern `fhir-zulip-nlp*.csv`.
 
 Releases from previous runs have been uploaded on [GitHub](https://github.com/jhu-bids/fhir-zulip-nlp-analysis/releases)
 and [GoogleDrive](https://drive.google.com/drive/u/0/folders/16MFLnKoKA5gk4ELbSnVS2GCjgR_R0ETL).
+
+### Codebook
+#### `zulip_raw_results.csv`
+TODO
+
+#### `zulip_raw_results_user_participation.csv`
+TODO
+
+#### `zulip_report_queries_with_no_results.csv`
+TODO
+
+#### `zulip_user_info.csv`
+TODO
+
+#### `zulip_report1_counts.csv`
+TODO
+
+#### `zulip_report2_thread_lengths.csv`
+TODO
+
+#### `zulip_report3_users.csv`
+- `user.id` (`integer`): Zulip-assigned user ID. Can look up this ID in `zulip_user_info.csv` to find more info.
+- `user.full_name` (`string`): Full name of user.
+- `stream` (`string`): Zulip stream.
+- `category` (`string`): Category as defined in [GoogleSheet](https://docs.google.com/spreadsheets/d/1OB0CEAkOhVTN71uIhzCo_iNaiD1B6qLqL7uwil5O22Q/).
+- `keyword` (`string`): Keyword as defined in [GoogleSheet](https://docs.google.com/spreadsheets/d/1OB0CEAkOhVTN71uIhzCo_iNaiD1B6qLqL7uwil5O22Q/).
+- `role` (`string`): Either 'author' or 'respondent'.
+- `count` (`integer`): The sum total for the combination of **`user` x `stream` x `category` x `keyword` x `role`**
+  for the given row. If any of these cells in the row is empty, the row represents a higher-level
+  aggregation. E.g. if only `keyword` is empty, the `count` represents the combo of **`user` x `stream` x `category` x
+  `role`**. If `keyword` and `category` are empty, the `count` represents the combo of **`user` x `stream` x
+  `role`** (see the pandas sketch just after this diff).
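
A minimal sketch of how a downstream script might split these three aggregation levels with pandas, assuming the report CSV sits in the working directory (the filename comes from `CONFIG` in the Python diff below; `keep_default_na=False` keeps the report's empty cells as empty strings rather than NaN):

```python
import pandas as pd

# Read the report; keep empty cells as '' (as written) instead of NaN.
df = pd.read_csv('zulip_report3_users.csv', keep_default_na=False)

# Finest granularity: user x stream x category x keyword x role
keyword_level = df[(df['category'] != '') & (df['keyword'] != '')]

# keyword empty -> aggregated to user x stream x category x role
category_level = df[(df['category'] != '') & (df['keyword'] == '')]

# keyword and category empty -> aggregated to user x stream x role
stream_level = df[(df['category'] == '') & (df['keyword'] == '')]

# Example use: total messages authored per user at the stream level
authored = stream_level[stream_level['role'] == 'author'] \
    .groupby('user.full_name')['count'].sum()
print(authored)
```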
158 changes: 142 additions & 16 deletions fhir_zulip_nlp/fhir_zulip_nlp.py
@@ -9,6 +9,8 @@
 3. The Zulip chat we're querying: https://chat.fhir.org/#
 4. Category keywords google sheet:
   https://docs.google.com/spreadsheets/d/1OB0CEAkOhVTN71uIhzCo_iNaiD1B6qLqL7uwil5O22Q/edit#gid=1136391153
+5. User roles google sheet:
+  https://docs.google.com/spreadsheets/d/1OB0CEAkOhVTN71uIhzCo_iNaiD1B6qLqL7uwil5O22Q/edit#gid=1504038457
 
 Possible areas of improvement
 1. Save to calling directory, not project directory.
@@ -23,7 +25,7 @@
 import time
 from argparse import ArgumentParser
 from datetime import datetime, date
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import zulip
 import pandas as pd
@@ -46,11 +48,15 @@
     'zuliprc_path': os.path.join(ENV_DIR, '.zuliprc'),  # rc = "runtime config"
     'chat_stream_name': 'terminology',
     'num_messages_per_query': 1000,
-    'outpath_report1': os.path.join(PROJECT_DIR, 'zulip_report1_counts.csv'),
-    'outpath_report2': os.path.join(PROJECT_DIR, 'zulip_report2_thread_lengths.csv'),
+    'outpath_user_info': os.path.join(PROJECT_DIR, 'zulip_user_info.csv'),
+    'outpath_report_counts': os.path.join(PROJECT_DIR, 'zulip_report1_counts.csv'),
+    'outpath_report_thread_length': os.path.join(PROJECT_DIR, 'zulip_report2_thread_lengths.csv'),
+    'outpath_report_users': os.path.join(PROJECT_DIR, 'zulip_report3_users.csv'),
+    'outpath_report_roles': os.path.join(PROJECT_DIR, 'zulip_report4_user_roles.csv'),  # todo
     'outpath_errors': os.path.join(PROJECT_DIR, 'zulip_errors.csv'),
     'outpath_no_results': os.path.join(PROJECT_DIR, 'zulip_report_queries_with_no_results.csv'),
     'outpath_raw_results': os.path.join(PROJECT_DIR, RAW_RESULTS_FILENAME),
+    'outpath_raw_results_user_participation': os.path.join(PROJECT_DIR, 'zulip_raw_results_user_participation.csv'),
     'results_cache_path': os.path.join(CACHE_DIR, RAW_RESULTS_FILENAME),
     'keywords_cache_path': os.path.join(CACHE_DIR, 'keywords.csv'),
@@ -223,10 +229,10 @@ def _get_counts_from_kw_messages(
     return kw_report
 
 
-def create_report1(
+def create_report_counts(
     df: pd.DataFrame, category_keywords: TYPE_KEYWORDS_DICT, kw_contexts: Dict[str, List[str]]
-) -> (pd.DataFrame, pd.DataFrame):
-    """Report 1: counts and latest/oldest message timestamps"""
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """Report: counts and latest/oldest message timestamps"""
     reports: List[Dict] = []
     no_results: List[Dict] = []
     today = str(date.today())
@@ -252,18 +258,18 @@ def create_report1(
     df_no_results = pd.DataFrame(no_results)
 
     # Save & return
-    df_report.to_csv(CONFIG['outpath_report1'], index=False)
+    df_report.to_csv(CONFIG['outpath_report_counts'], index=False)
     if len(df_no_results) > 0:
         df_no_results.to_csv(CONFIG['outpath_no_results'], index=False)
 
     return df_report, df_no_results
 
 
-def create_report2(
+def create_report_thread_length(
     df: pd.DataFrame, category_keywords: TYPE_KEYWORDS_DICT, kw_contexts: Dict[str, List[str]],
     include_all_columns=False
-) -> (pd.DataFrame, pd.DataFrame):
-    """Report 2: thread lengths
+) -> pd.DataFrame:
+    """Report: thread lengths
     include_all_columns: Pending bugfix. If false, excludes these columns from report.
     todo: fix category / all messages counts and stddev. Right now the counts are mostly 0; not being calculated based
         on the correct message selections. Not sure if there are stddev calc issues; could just be because of counts.
@@ -369,10 +375,128 @@ def create_report2(
     df_report = pd.DataFrame(reports)
     df_report = format_df(df_report)
     # Save & return
-    df_report.to_csv(CONFIG['outpath_report2'], index=False)
+    df_report.to_csv(CONFIG['outpath_report_thread_length'], index=False)
     return df_report
 
 
+def create_report_users(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """Report: Users
+    # todo: Pending completion of 'streams' issue, change output `stream_id` -> `stream` / `stream_name`
+    # todo: Does it make sense to refactor this to start with users first, then drill down?"""
+    stream_id_name_map = {'179202': 'terminology'}
+
+    # Get basic user data
+    user_ids: List[int] = list(df['sender_id'].unique())
+    user_data: Dict[int, Dict[str, Any]] = {}
+    for user_id in user_ids:
+        df_i = df[df['sender_id'] == user_id].iloc[0]
+        user_data[user_id] = {
+            'user_id': user_id,
+            'full_name': df_i['sender_full_name'],
+            'email': df_i['sender_email']
+        }
+    user_info_df = pd.DataFrame(user_data.values())
+    user_info_df.to_csv(CONFIG['outpath_user_info'], index=False)
+
+    # todo: not sure which of these user data structures i'll use, or both
+    # TODO: I really don't like how repetitive this is
+    user_participation_by_user: Dict[int, List[Dict[str, Any]]] = {u: [] for u in user_ids}
+    user_participation: List[Dict[str, Any]] = []
+    streams = df['stream_id'].unique()
+    for stream_id in streams:
+        stream_name = stream_id_name_map[str(stream_id)]
+        df_i = df[df['stream_id'] == stream_id]
+        # todo: Would they like the 0 totals as well? If so, rather than .unique(), should use `category_keywords`
+        categories = df_i['category'].unique()
+        for c in categories:
+            df_i2 = df_i[df_i['category'] == c]
+            keywords = df_i2['keyword'].unique()
+            for k in keywords:
+                df_i3 = df_i2[df_i2['keyword'] == k]
+                threads = df_i3['subject'].unique()
+                for thread in threads:
+                    df_i4 = df_i3[df_i3['subject'] == thread]
+                    # Get authorship vs non-authorship
+                    participant_roles = {p: 'respondent' for p in df_i4['sender_id'].unique()}
+                    author_timestamp = min(df_i4['timestamp'])
+                    author_id: int = list(
+                        df_i4[df_i4['timestamp'] == author_timestamp]['sender_id'].to_dict().values())[0]
+                    participant_roles[author_id] = 'author'
+
+                    # Populate: user_participation
+                    for user_id, role in participant_roles.items():
+                        row = {
+                            'user.id': user_id,
+                            'user.full_name': user_data[user_id]['full_name'],
+                            'stream': stream_name,
+                            'category': c,
+                            'keyword': k,
+                            'thread': thread,
+                            'user.role': role
+                        }
+                        user_participation_by_user[user_id].append(row)
+                        user_participation.append(row)
+    user_participation_df = pd.DataFrame(user_participation)
+    user_participation_df.to_csv(CONFIG['outpath_raw_results_user_participation'], index=False)
+
+    # TODO: I really don't like how repetitive this is; even worse than previous block
+    # TODO: Have aggregated to keyword, category, and stream, but not to role agnostic of stream. Would be useful to add
+    #  ...this, once streams feature is complete.
+    # TODO: aggregate to agnostic of role? for every level? stream, category, keyword? If so, can call 'participant'
+    user_participation_stats = []
+    for user_id in user_ids:
+        df_i = user_participation_df[user_participation_df['user.id'] == user_id]
+        streams = list(df_i['stream'].unique())
+        for s in streams:
+            df_i2 = df_i[df_i['stream'] == s]
+            role_counts: Dict[str, int] = df_i2['user.role'].value_counts().to_dict()
+            for role, count in role_counts.items():
+                row = {
+                    'user.id': user_id,
+                    'user.full_name': user_data[user_id]['full_name'],
+                    'stream': s,
+                    'category': '',
+                    'keyword': '',
+                    'role': role,
+                    'count': count,
+                }
+                user_participation_stats.append(row)
+            categories = list(df_i2['category'].unique())
+            for c in categories:
+                df_i3 = df_i2[df_i2['category'] == c]
+                role_counts: Dict[str, int] = df_i3['user.role'].value_counts().to_dict()
+                for role, count in role_counts.items():
+                    row = {
+                        'user.id': user_id,
+                        'user.full_name': user_data[user_id]['full_name'],
+                        'stream': s,
+                        'category': c,
+                        'keyword': '',
+                        'role': role,
+                        'count': count,
+                    }
+                    user_participation_stats.append(row)
+                keywords = list(df_i2['keyword'].unique())
+                for k in keywords:
+                    df_i4 = df_i3[df_i3['keyword'] == k]
+                    role_counts: Dict[str, int] = df_i4['user.role'].value_counts().to_dict()
+                    for role, count in role_counts.items():
+                        row = {
+                            'user.id': user_id,
+                            'user.full_name': user_data[user_id]['full_name'],
+                            'stream': s,
+                            'category': c,
+                            'keyword': k,
+                            'role': role,
+                            'count': count,
+                        }
+                        user_participation_stats.append(row)
+    user_participation_stats_df = pd.DataFrame(user_participation_stats)
+    user_participation_stats_df.to_csv(CONFIG['outpath_report_users'], index=False)
+
+    return user_info_df, user_participation_df, user_participation_stats_df
+
+
 # TODO: In order to account for the possibility that people could edit their prior messages, can add as a param to this
 #  ...function "account_for_edits"=True, or "cache"=False
 #  ...People like Peter Jordan are constantly editing their messages =P
@@ -461,13 +585,15 @@ def _get_keyword_contexts(use_cached_keyword_inputs=False) -> Dict[str, List[str
 
 def run(analyze_only=False, use_cached_keyword_inputs=False):
     """Run program"""
+    # Get inputs
     keywords: TYPE_KEYWORDS_DICT = _get_keywords(use_cached_keyword_inputs)
-    message_df: pd.DataFrame = query_categories(keywords) if not analyze_only else _load_cached_messages()
     kw_contexts: Dict[str, List[str]] = _get_keyword_contexts()
-    # - report 1: counts and latest/oldest message timestamps && keywords w/ no results
-    create_report1(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
-    # - report 2: thread lengths
-    create_report2(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
+    # Get messages
+    message_df: pd.DataFrame = query_categories(keywords) if not analyze_only else _load_cached_messages()
+    # Create reports
+    create_report_counts(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
+    create_report_thread_length(df=message_df, category_keywords=keywords, kw_contexts=kw_contexts)
+    create_report_users(message_df)
 
 
 def cli():
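
For reference, the heart of the feature above is the role-assignment rule in `create_report_users()`: within each thread, whoever sent the earliest message is the 'author', and every other participant is a 'respondent'. Here is a condensed restatement on a toy DataFrame (sender IDs and timestamps are made up, and `idxmin()` stands in for the original min-then-filter lookup):

```python
import pandas as pd

# One toy thread: three messages from two senders (illustrative values only).
msgs = pd.DataFrame({
    'sender_id': [101, 202, 101],
    'timestamp': [1660000000, 1660000100, 1660000200],
})

# Every participant starts as a 'respondent'...
participant_roles = {p: 'respondent' for p in msgs['sender_id'].unique()}

# ...then the sender of the earliest-timestamped message becomes the 'author'.
author_id = int(msgs.loc[msgs['timestamp'].idxmin(), 'sender_id'])
participant_roles[author_id] = 'author'

print(participant_roles)  # {101: 'author', 202: 'respondent'}
```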
