- Bugfix: Report 1: Fixed a case where there were no rows in the filtered dataframe.

- UX: For the GoogleSheets token refresh error, added a note that the user can simply run the script again.
- Code style: Updates for PEP8 compliance.
- Refactor: Renamed some index variables to i, i2, i3, which is hopefully easier to read.
- Bugfix: No longer divides by zero when no threads are found (see the sketch after this list).
- Bugfix: zulip_report_keywords_with_no_results.csv: Was showing the keyword but not the spelling or context. Updated to be accurate and more informative.
- UX: Removed the messages that printed for every item that had no results; a report is saved for this instead.
- Update: Column names; variable names.
- UX: Temporarily deactivated some report 2 standard deviation and thread length columns, pending a bugfix.
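
The empty-dataframe and divide-by-zero fixes in this commit share one guard pattern: compute a span or average only when the filtered collection is non-empty, and fall back to a default otherwise. A minimal, self-contained sketch of that pattern (the helper name and sample data are hypothetical, not from the repo):

import pandas as pd

SECONDS_PER_DAY = 86400

def thread_length_days(df: pd.DataFrame) -> str:
    """Span of a thread in days, or '' when the filtered dataframe has no rows."""
    if len(df) == 0:  # guard: avoids IndexError / division on an empty frame
        return ''
    df = df.sort_values(['timestamp'])  # oldest first
    span = (df['timestamp'].iloc[-1] - df['timestamp'].iloc[0]) / SECONDS_PER_DAY
    return f'{span:.1f}'

# Hypothetical data: two messages one day apart, then an empty selection.
msgs = pd.DataFrame({'timestamp': [0, SECONDS_PER_DAY]})
print(thread_length_days(msgs))                         # -> '1.0'
print(thread_length_days(msgs[msgs['timestamp'] < 0]))  # -> ''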
joeflack4 committed Aug 9, 2022
1 parent 673f2a0 commit b22e3ea
Showing 1 changed file with 52 additions and 42 deletions.
94 changes: 52 additions & 42 deletions fhir_zulip_nlp/fhir_zulip_nlp.py
@@ -49,7 +49,7 @@
'outpath_report1': os.path.join(PROJECT_DIR, 'zulip_report1_counts.csv'),
'outpath_report2': os.path.join(PROJECT_DIR, 'zulip_report2_thread_lengths.csv'),
'outpath_errors': os.path.join(PROJECT_DIR, 'zulip_errors.csv'),
-    'outpath_no_results': os.path.join(PROJECT_DIR, 'zulip_report_keywords_with_no_results.csv'),
+    'outpath_no_results': os.path.join(PROJECT_DIR, 'zulip_report_queries_with_no_results.csv'),
'outpath_raw_results': os.path.join(PROJECT_DIR, RAW_RESULTS_FILENAME),
'results_cache_path': os.path.join(CACHE_DIR, RAW_RESULTS_FILENAME),
'keywords_cache_path': os.path.join(CACHE_DIR, 'keywords.csv'),
@@ -99,6 +99,10 @@ def _load_keywords_df(
f'\n- File name: {os.path.basename(cache_path)}'
f'\n- Last modified: {last_modified}'
f'\n- Error: {str(err)}', file=sys.stderr)
+        if 'invalid_grant' in str(err):
+            raise RuntimeError(
+                'GoogleSheets token will automatically be refreshed the next time this program is run. You can '
+                'simply run again immediately.')
df: pd.DataFrame = pd.read_csv(cache_path).fillna('')

# Massage
@@ -138,7 +142,6 @@ def query_keyword(
continue
messages += res['messages']
if not messages:
-            print(f'No messages found for: {keyword}')
break
anchor = messages[-1]['id'] # returned messages are in chronological order; -1 is most recent in batch
if res['found_newest']: # this assumes API will reliably always return this
@@ -199,9 +202,11 @@ def _get_counts_from_kw_messages(
if context:
df = _get_messages_with_context(df, context)
# Calculate thread length
-    df = df.sort_values(['timestamp'])  # oldest first
-    z = ((list(df['timestamp'])[-1] - list(df['timestamp'])[0]) / 86400)
-    threadlen = f'{z:.1f}'
+    threadlen = ''
+    if len(df) > 0:
+        df = df.sort_values(['timestamp'])  # oldest first
+        z = ((list(df['timestamp'])[-1] - list(df['timestamp'])[0]) / 86400)
+        threadlen = f'{z:.1f}'
# Create report
kw_report = {
'category': category,
@@ -223,7 +228,7 @@ def create_report1(
) -> (pd.DataFrame, pd.DataFrame):
"""Report 1: counts and latest/oldest message timestamps"""
reports: List[Dict] = []
-    no_result_keywords: List[str] = []
+    no_results: List[Dict] = []
today = str(date.today())
for c, keywords in category_keywords.items():
for k, spellings in keywords.items():
@@ -236,16 +241,15 @@
kw_report: Dict = _get_counts_from_kw_messages(
df=df_i, category=c, keyword=k, spelling=s, today=today, context=context)
if kw_report['num_messages_with_kw_spelling'] == 0:
-                    no_result_keywords.append(k)
+                    no_results.append({'keyword': k, 'spelling': s, 'context': context, 'results': 0})
reports.append(kw_report)

# Report
df_report = pd.DataFrame(reports)
df_report = format_df(df_report)

# No results report
-    df_no_results = pd.DataFrame()
-    df_no_results['keywords_with_no_results'] = no_result_keywords
+    df_no_results = pd.DataFrame(no_results)

# Save & return
df_report.to_csv(CONFIG['outpath_report1'], index=False)
@@ -256,24 +260,29 @@ def create_report2(


def create_report2(
-        df: pd.DataFrame, category_keywords: TYPE_KEYWORDS_DICT, kw_contexts: Dict[str, List[str]]
+        df: pd.DataFrame, category_keywords: TYPE_KEYWORDS_DICT, kw_contexts: Dict[str, List[str]],
+        include_all_columns=False
) -> (pd.DataFrame, pd.DataFrame):
"""Report 2: thread lengths"""
"""Report 2: thread lengths
include_all_columns: Pending bugfix. If false, excludes these columns from report.
todo: fix category / all messages counts and stddev. Right now the counts are mostly 0; not being calculated based
on the correct message selections. Not sure if there are stddev calc issues; could just be because of counts.
"""
reports: List[Dict] = []
seconds_per_day = 86400
today = date.today()
-    tot_all, std_all, num_all_threads = 0, 0, 0
-    avg_total,std_tot = 0,0
-    for j, (category, keywords) in enumerate(category_keywords.items()):
+    tot_all, std_all, num_all_threads = 0, 0, 0
+    avg_total, std_tot = 0, 0
+    for i, (category, keywords) in enumerate(category_keywords.items()):
tot_category, var_category, avg_category, std_category, num_threads = 0, 0, 0, 0, 0

-        for a,(k, spellings) in enumerate(keywords.items()):
+        for i2, (k, spellings) in enumerate(keywords.items()):
contexts = kw_contexts.get(k, [])
# add null context '': needed to capture messages where no context words appear
contexts = contexts + [''] if '' not in contexts else contexts
-            for s in (spellings):
-                df_i = df[df['keyword_spelling'] == s]
-                for context in (contexts):
+            for spelling in spellings:
+                df_i = df[df['keyword_spelling'] == spelling]
+                for context in contexts:
df_j = _get_messages_with_context(df_i, context)
df_j = df_j.sort_values(['timestamp']) # oldest first
threads: List[str] = list(df_j['subject'].unique())
Expand All @@ -293,7 +302,7 @@ def create_report2(
thread_data[thread] = df_thread
num_all_threads += 1
tot_all += thread_len
-                    avg_len_kw_thread = round(tot_thread_len / len(threads), 3)
+                    avg_len_kw_thread = round(tot_thread_len / len(threads), 3) if threads else 0
# Outliers
# TODO: Refactor to pandas to reduce lines and improve performance?
# TODO: Add cols for 1 and 2 std deviations?
@@ -306,12 +315,12 @@ def create_report2(
thread_len = (list(df_thread['timestamp'])[-1]
- list(df_thread['timestamp'])[0]) / seconds_per_day
sum_square_distance += (float(thread_len) - float(avg_len_kw_thread)) ** 2
-                    stddev_kw_threads = math.sqrt(sum_square_distance / len(threads))
+                    stddev_kw_threads = math.sqrt(sum_square_distance / len(threads)) if threads else 0
# Calc how many std deviations away per thread
tot_category += tot_thread_len
var_category += stddev_kw_threads ** 2
std_all += stddev_kw_threads ** 2
-                    for i, thread in enumerate(threads):
+                    for i3, thread in enumerate(threads):
outlier = False
df_thread = thread_data[thread]
thread_len = (list(df_thread['timestamp'])[-1]
@@ -321,39 +330,40 @@
or thread_len < avg_len_kw_thread - stddev_kw_threads:
outlier = True
std_away = abs(thread_len - avg_len_kw_thread) / stddev_kw_threads
-                        # Calc URL
-                        t = dict(df_thread.iloc[0])  # representative row of whole df; all values should be same
-                        url = 'https://chat.fhir.org/#narrow/' + \
-                            f'{t["type"]}/{t["stream_id"]}-{t["display_recipient"]}' + f'/topic/{t["subject"]}'
-                        # Append to report

-                        if i == len(threads) - 1:
+                        if i3 == len(threads) - 1:
avg_category = round(tot_category / num_threads, 2)
std_category = round(math.sqrt(var_category / num_threads), 2)
num_threads = 0
tot_category = 0
var_category = 0
-                        if a == len(list(keywords.keys()))-1:
+                        if i2 == len(list(keywords.keys())) - 1:
avg_total = round((tot_all / num_all_threads), 2)
std_tot = round(math.sqrt(std_all / num_all_threads), 2)


+                        # Calc URL
+                        thread_df = dict(df_thread.iloc[0])  # representative row of whole df; all values should be same
+                        url = 'https://chat.fhir.org/#narrow/' + \
+                            f'{thread_df["type"]}/{thread_df["stream_id"]}-{thread_df["display_recipient"]}' + \
+                            f'/topic/{thread_df["subject"]}'
+                        # Append to report
kw_report = {
'category': category,
'keyword': k,
-                            'kw_avg_thread_len': str(avg_len_kw_thread),
-                            'thread_name': thread,
-                            'thread_length_days': f'{thread_len:.1f}',
-                            'thread_stddev_from_kw_avg_thread_len': str(round(std_away, 2)),
-                            'outlier?': str(outlier),
-                            'avg_total': str(avg_total),
-                            'std_total': str(std_tot),
-                            'avg_category': avg_category,
-                            'std_category': std_category,
                            'thread': thread,
                            'thread_url': url,
-                            'query_date': today
+                            'thread_len_days': f'{thread_len:.1f}',
+                            'avg_len_kw_outlier?': outlier,
+                            'avg_len_kw': str(avg_len_kw_thread),
+                            'stddev_kw': str(round(std_away, 2)),
+                            'query_date': today,
}
-                        avg_category,std_category,avg_total,std_tot = 0,0,0,0
+                        if include_all_columns:
+                            kw_report = {**kw_report, **{
+                                'avg_len_category': avg_category,
+                                'stddev_category': std_category,
+                                'avg_len_total': str(avg_total),
+                                'stddev_total': str(std_tot),
+                            }}
+                        avg_category, std_category, avg_total, std_tot = 0, 0, 0, 0
reports.append(kw_report)

df_report = pd.DataFrame(reports)

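For reference, a hedged usage sketch of the new include_all_columns flag; the df, category_keywords, and kw_contexts arguments are placeholders for values produced by the earlier steps of the pipeline, and the second returned dataframe is unpacked without being used here:

# Default: the pending-bugfix category/total average and stddev columns are omitted.
df_report2, _ = create_report2(
    df=df, category_keywords=category_keywords, kw_contexts=kw_contexts)

# Opt back in to avg_len_category, stddev_category, avg_len_total, stddev_total
# once the counts/stddev bugfix lands:
df_report2_full, _ = create_report2(
    df=df, category_keywords=category_keywords, kw_contexts=kw_contexts,
    include_all_columns=True)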