import pandas as pd
import numpy as np
import streamlit as st
import base64
from scipy.stats import ttest_rel, ttest_ind, levene, mannwhitneyu
import math
# ---------------------
# HELPER FUNCTIONS
# ---------------------
# def form_callback():
# st.session_state.myform = True
# acknowledgers
# ---------------
acknowledgers = ['Ok', 'Sounds good', 'Perfect', 'Great', 'Alrighty', 'Got it']  # other candidates: 'Fantastic', 'Excellent'
neg_acknowledgers = ["That's ok", "That's alright", "No problem"]
exclamations = ['!', '.']

@st.cache
def generate_acks(m, n, seed):
    """Generate m random positive and n random negative acknowledger strings."""
    np.random.seed(seed)
    acks = []
    neg_acks = []
    for i in range(m):
        acks.append(acknowledgers[np.random.randint(0, len(acknowledgers))]
                    + exclamations[np.random.randint(0, len(exclamations))])
    for i in range(n):
        neg_acks.append(neg_acknowledgers[np.random.randint(0, len(neg_acknowledgers))]
                        + exclamations[np.random.randint(0, len(exclamations))])
    return acks, neg_acks
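# Example usage (a sketch; the argument values are arbitrary). The fixed seed
# makes the sampled strings reproducible across Streamlit reruns:
# acks, neg_acks = generate_acks(3, 2, seed=42)
# acks might look like ['Perfect!', 'Ok.', 'Got it!'] -- exact strings depend on the seed.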
# display PDFs
# --------------
def displayPDF(file):
    # Opening file from file path
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
    # Embedding PDF in HTML
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf"></iframe>'
    # Displaying file
    st.markdown(pdf_display, unsafe_allow_html=True)
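# Example usage (a sketch; the file path is hypothetical):
# displayPDF('assets/methodology.pdf')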
# detect outliers function
# --------------------------
def is_outlier(points, thresh=3.5):
    """
    Returns a boolean array with True if points are outliers and False
    otherwise.

    Parameters:
    -----------
    points : a numobservations-length array of observations
    thresh : the modified z-score to use as a threshold. Observations with
        a modified z-score (based on the median absolute deviation) greater
        than this value will be classified as outliers.

    Returns:
    --------
    mask : a numobservations-length boolean array.

    References:
    ----------
    Boris Iglewicz and David Hoaglin (1993), pp. 11-12, "Volume 16: How to Detect and
    Handle Outliers", The ASQC Basic References in Quality Control:
    Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
    """
    median = np.median(points, axis=0)
    # absolute deviation of each point from the median
    diff = np.abs(points - median)
    med_abs_deviation = np.median(diff)
    # 0.6745 scales the MAD-based score to be comparable to a standard z-score
    # for normally distributed data
    modified_z_score = 0.6745 * diff / med_abs_deviation
    return modified_z_score > thresh
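# Example usage (a sketch): the modified z-score flags the one obvious outlier
# in a small 1-D sample.
# >>> is_outlier(np.array([1.0, 1.1, 0.9, 1.05, 50.0]))
# array([False, False, False, False,  True])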
# conduct independent samples t-test
# ------------------------------------
def custom_ttest(_group1, _group2, test_type, _0s_desired=None, _0s_included=None,
                 n1=None, n2=None, _is_outlier=is_outlier):
    if test_type == 'ind':
        # Check for outliers, not counting 0s.
        # Note that if we find outliers and trim them with Yuen's t-test (below), we basically
        # guarantee normality of the test metric. The CLT is only violated if the mean of the
        # distribution of sample means is not defined, which basically only happens when there
        # are significant outliers and the distribution has a long tail.
        # We do not include $0 transactions when determining outliers, because all nonzero values
        # may be flagged as outliers, and normality will still hold as long as there are no
        # outliers within the nonzero data.
        outliers1 = sum(_is_outlier(_group1[(_group1 != 0) & (~pd.isna(_group1))]))
        outliers2 = sum(_is_outlier(_group2[(_group2 != 0) & (~pd.isna(_group2))]))
        group1_nona = _group1[~pd.isna(_group1)]
        group2_nona = _group2[~pd.isna(_group2)]
        yes_no_bool = {'Yes': True, 'No': False}
        _0s_desired = yes_no_bool[_0s_desired]
        _0s_included = yes_no_bool[_0s_included]
        if _0s_desired == _0s_included:
            # zeros are either wanted and already present, or unwanted and already absent
            group1 = group1_nona
            group2 = group2_nona
        elif _0s_desired and not _0s_included:
            # pad each group with $0 observations up to its full size
            group1 = pd.concat([group1_nona, pd.Series(np.repeat(0, n1 - len(group1_nona)))])
            group2 = pd.concat([group2_nona, pd.Series(np.repeat(0, n2 - len(group2_nona)))])
        else:
            # zeros are present but not wanted, so drop them
            group1 = group1_nona[group1_nona != 0]
            group2 = group2_nona[group2_nona != 0]
        # if there were outliers, set the trim value using the new group length (if we added 0s)
        if outliers1 > 0 or outliers2 > 0:
            # Set trim to the max fraction of outliers between the 2 groups. It cannot be >= .5,
            # because this fraction of observations is down-weighted on each side of the distribution.
            trim = round(min(.49, max(outliers1 / len(group1), outliers2 / len(group2))), 6)
        else:
            trim = 0
        # Check for equality of variance. Levene's test is not sensitive to departures from normality.
        var_test = levene(group1, group2)
        # if the p-value is not significant at the 10% level, assume variances are equal
        if var_test[1] >= .1:
            result = ttest_ind(group1, group2, equal_var=True, trim=trim)
        # otherwise, assume they are unequal (Welch's t-test)
        else:
            result = ttest_ind(group1, group2, equal_var=False, trim=trim)
    elif test_type == 'rel':
        # there should be no NAs in this data
        group1 = _group1
        group2 = _group2
        trim = 0
        result = ttest_rel(group1, group2)
    pval = result[1]
    mean1 = group1.mean()
    mean2 = group2.mean()
    # if Yuen's trimmed t-test was conducted, let the user know
    ntrim1 = int(np.floor(trim * len(group1)))  # from the scipy ttest_ind documentation
    ntrim2 = int(np.floor(trim * len(group2)))
    trimmed = ntrim1 > 0 or ntrim2 > 0
    if trimmed:
        st.markdown('''
        Significant outliers were detected in your data, so we've run a _Trimmed Means t-test_ and a _Mann-Whitney U test_.
        These tests use different methods to account for departures from normality of the sample mean, which are likely
        when outliers are present.
        ''')
        st.markdown('''
        ##### Trimmed Means t-test
        A [Trimmed Means t-test](https://www.real-statistics.com/students-t-distribution/problems-data-t-tests/trimmed-means-t-test) is a t-test that
        compares the "trimmed means," i.e. the means after removing outliers, but still includes all values in the variance
        calculation. The overall effect is to reduce the impact of the outliers on the test.
        ''')
    st.markdown('Mean 1: **{:.4f}**. Mean 2: **{:.4f}**. Mean difference: **{:.4f}**'.format(mean1, mean2, mean1 - mean2))
    if trimmed:
        # trimmed means are just the means after removing trimmed values
        sort1 = group1.sort_values()
        sort2 = group2.sort_values()
        if ntrim1 == 0:
            tmean1 = mean1
            trimmed_vals1 = pd.Series({1: 'None'}).rename('Group 1 trimmed values')
        else:
            tmean1 = sort1.iloc[ntrim1:-ntrim1].mean()
            trimmed_vals1 = pd.concat([sort1.iloc[:ntrim1], sort1.iloc[-ntrim1:]])
            # make index start with 1 to be user-friendly
            trimmed_vals1.index = np.arange(1, len(trimmed_vals1) + 1)
        if ntrim2 == 0:
            tmean2 = mean2
            trimmed_vals2 = pd.Series({1: 'None'}).rename('Group 2 trimmed values')
        else:
            tmean2 = sort2.iloc[ntrim2:-ntrim2].mean()
            trimmed_vals2 = pd.concat([sort2.iloc[:ntrim2], sort2.iloc[-ntrim2:]])
            # make index start with 1 to be user-friendly
            trimmed_vals2.index = np.arange(1, len(trimmed_vals2) + 1)
        st.markdown('Trimmed mean 1: **{:.4f}**. Trimmed mean 2: **{:.4f}**. Trimmed mean difference: **{:.4f}**'.format(tmean1, tmean2, tmean1 - tmean2))
    st.markdown('**P-Value: ' + '{:.3f}**'.format(pval))
    if round(pval, 3) <= .010:
        st.success('This difference is significant at the 1% level.')
    elif round(pval, 3) <= .050:
        st.success('This difference is significant at the 5% level.')
    elif round(pval, 3) <= .100:
        st.success('This difference is significant at the 10% level.')
    elif np.isnan(pval):
        st.error('''Your test result couldn't be calculated. Make sure you have at least 2
        observations in each group, and for a paired t-test, equal sample sizes.
        ''')
    else:
        st.warning('''This difference would not typically be considered statistically significant. Be sure to consult
        the 'Evaluate a test' section to confirm that you have a large enough sample to detect your desired effect size.
        ''')
    if trimmed:
        st.write('''
        You can see the trimmed values below. Note that the same proportion of values must be trimmed from both ends
        of each group. In this case, we trimmed {:.3%} of observations from each end (if this rounds to 0, the group is unaffected).
        '''.format(trim))
        col5, col6 = st.columns(2)
        col5.write(trimmed_vals1.rename('Group 1 trimmed values'))
        col6.write(trimmed_vals2.rename('Group 2 trimmed values'))
        st.markdown('''
        ##### Mann-Whitney U test
        We've also conducted a [Mann-Whitney U test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), a nonparametric
        statistical test that is robust in the presence of outliers. It is sometimes interpreted as a comparison of medians, as opposed
        to means.
        ''')
        # Calculate the Mann-Whitney statistic and p-value. The statistic is the number of pairwise
        # comparisons between the two groups that the first group wins; the total number of pairwise
        # comparisons is n1*n2. The p-value is the probability of observing a statistic at least this
        # extreme if the two groups were drawn from the same distribution.
        u1, mw_pval = mannwhitneyu(group1, group2)
        n1 = len(group1)
        n2 = len(group2)
        # calculate the 'common language effect size', i.e. the proportion of pairwise comparisons won by the winning group
        u2 = n1 * n2 - u1
        cles = max(u1 / (n1 * n2), u2 / (n1 * n2))
        if u1 > u2:
            st.markdown('Items from Group 1 appear to be larger than items from Group 2.')
            st.markdown(f'''
            __Out of {format(int(n1 * n2), ',d')} pairwise comparisons__
            between items of the two groups, __Group 1 won {format(int(u1), ',d')} ({format(u1 / (u1 + u2), '.1%')})__ and __Group 2 won
            {format(int(u2), ',d')} ({format(u2 / (u1 + u2), '.1%')})__. Group 1 won {format(int(u1 - u2), ',d')} more.
            ''')
        elif u2 > u1:
            st.markdown('Items from Group 2 appear to be larger than items from Group 1.')
            st.markdown(f'''
            __Out of {format(int(n1 * n2), ',d')} pairwise comparisons__
            between items of the two groups, __Group 2 won {format(int(u2), ',d')} ({format(u2 / (u1 + u2), '.1%')})__ and __Group 1 won
            {format(int(u1), ',d')} ({format(u1 / (u1 + u2), '.1%')})__. Group 2 won {format(int(u2 - u1), ',d')} more.
            ''')
        st.markdown('**P-Value: ' + '{:.3f}**'.format(mw_pval))
        if round(mw_pval, 3) <= .010:
            st.success('This difference is significant at the 1% level.')
        elif round(mw_pval, 3) <= .050:
            st.success('This difference is significant at the 5% level.')
        elif round(mw_pval, 3) <= .100:
            st.success('This difference is significant at the 10% level.')
        elif np.isnan(mw_pval):
            # TODO: FIND MINIMUM SAMPLE SIZE
            st.error('''Your test result couldn't be calculated. Make sure you have at least 2
            observations in each group!
            ''')
        else:
            st.warning('''This difference would not typically be considered statistically significant. Be sure to consult
            the 'Evaluate a test' section to confirm that you have a large enough sample to detect your desired effect size.
            ''')
def ttest_pval_dropdowns():
    st.write('''If you'd like to learn more about t-tests or p-values, click below:''')
    ttest_info = st.expander('Learn more about t-tests')
    ttest_info.markdown('''
    A two-sample __t-test__ is a statistical hypothesis test that helps determine whether there is any real
    difference between two sets of data, by comparing their means (averages) and variances.
    As with any hypothesis test, it proposes a _null hypothesis_ and an _alternative hypothesis_. The null
    hypothesis states that the two sets of observations were
    drawn from the same distribution, i.e. that there is no real difference between them. If the _p-value_
    produced by the test is low enough, we may reject the null hypothesis and
    conclude that there is a real difference between the two groups.
    __Be sure to pay attention to which mean is
    greater__ -- the test is only concerned with the absolute difference between the two groups, not the
    direction of the difference.
    ''')
    pval_info = st.expander('Learn more about p-values')
    pval_info.markdown('''
    ###### Overview
    A __p-value__ [(Wikipedia)](https://en.wikipedia.org/wiki/P-value) is a number between 0 and 1 that is output by statistical
    hypothesis tests such as t-tests.
    In the case of two-sample t-tests, it represents the probability that the difference in means (averages) observed between
    two sets of data is due to _random chance_. The larger the difference, and the less variability there is in the data,
    the lower the p-value and the more certain we can be that there is a real difference between the two sets.
    Technically speaking, when the p-value is low enough, we may reject the _null hypothesis_, which states that the
    two sets of data are drawn from the same distribution.
    ###### Thresholds
    There are various rules of thumb for p-value thresholds. You may notice that in this app, we specifically call out
    thresholds of .1, .05, and .01 -- these are three very common ones. If our p-value is less than .01, for example, we may say
    that we have _99% certainty_ that there is a difference between the two sets of data. Usually, when there is less data,
    we may allow a higher threshold, because smaller sample sizes have more variability, and it is therefore less likely
    that we will observe a difference even if it exists. (Consider drawing red and blue marbles from a jar, and trying to determine
    whether the split of red and blue marbles is 50-50. Let's say that after 4 draws,
    you hold 3 blue marbles and 1 red. Are you ready to reject the hypothesis? How about if after 400 draws you hold 300 blue and 100 red?
    We become more confident after more draws, because of the [law of large numbers](https://en.wikipedia.org/wiki/Law_of_large_numbers).)
    Accordingly, for data with fewer than 100 observations, a .1 threshold may be sufficient to reject the null hypothesis, whereas for
    data with more than 1000, a threshold of .01 would likely be needed.
    ###### A Word of Caution
    It is important to interpret the p-value yourself based on your use case, and not just rely on comparison against a threshold.
    For example, if you have 95 observations and your p-value comes out to .11, wouldn't you agree that there is _some_ significance there?
    Statistics deals with _probabilities_, so looking to it for black-and-white answers, while tempting, can also be dangerous because
    it oversimplifies reality. Always use your best judgment, and feel free to reach out to the Decision Science team with questions
    about how to interpret your results! We are always happy to help :).
    ''')
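# The marble example above can be made concrete with an exact binomial test
# (a sketch; a different test than the t-test, but the intuition is the same):
# from scipy.stats import binomtest
# binomtest(3, 4, p=0.5).pvalue      # 0.625 -- far from significant after 4 draws
# binomtest(300, 400, p=0.5).pvalue  # vanishingly small -- decisive after 400 draws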
def sample_size_calc(expected, split, var_oec, monthly):
    pct_changes = list(range(5, 55, 5))
    sample_sizes = []
    times = []
    for pct in pct_changes:
        d = expected * pct / 100
        non_50_factor = 1 / ((4 * split / 100) * (1 - split / 100))
        # Kohavi, p. 175. Note that this is NOT the same as letting the smaller variant reach the
        # per-variant sample size; it is less.
        n = non_50_factor * 2 * 16 * var_oec / d**2
        # Kohavi, p. 152
        t = n / monthly
        sample_sizes.append(format(int(math.ceil(n)), ',d'))
        times.append('{:.1f}'.format(t))
    st.markdown('''Below are recommended total samples to achieve 80% power and 95% confidence, and the resulting runtimes in months,
    for various percent changes to the evaluation metric:
    ''')
    df = pd.DataFrame({'Pct Change': pct_changes, 'Total Samples Required': sample_sizes, 'Test Duration (months)': times})
    df.index = np.arange(1, df.shape[0] + 1)
    st.dataframe(df, height=500)
    st.write('Source: Kohavi, Formula 2, p. 152 (assumes a 50/50 split). This can be scaled according to Section 6.2.4, p. 175.')
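# Worked example (a sketch with hypothetical inputs): expected = 20, split = 50,
# var_oec = 400, monthly = 5000. For a 10% change, d = 20 * 10/100 = 2 and the
# split factor is 1 / ((4 * 50/100) * (1 - 50/100)) = 1, so
# n = 1 * 2 * 16 * 400 / 2**2 = 3,200 total samples and t = 3200 / 5000 = 0.64 months.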