-
Notifications
You must be signed in to change notification settings - Fork 0
/
bin_util.py
29 lines (29 loc) · 1.16 KB
/
bin_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def generate_bin_interval(df, bin_col, count_col=None, bin_interval=5, bin_threshold=100):
    """
    Summarise ``df[bin_col]`` into fixed-width bins with one overflow bin.

    Each value ``x`` is assigned to the half-open interval
    ``[k * bin_interval, (k + 1) * bin_interval)`` that contains it. Every
    value greater than ``bin_threshold`` is collected into a single overflow
    bin labelled ``"<bin_threshold>+"`` (two decimals, e.g. ``"100.00+"``).

    Parameters:
        df: DataFrame holding the data. It is NOT modified by this function.
        bin_col: Name of the numeric column to bin.
        count_col: Optional column name. When given, each bin reports the
            number of distinct values of this column (column
            ``"<count_col>_Count"``); otherwise each bin reports its row
            count (column ``"Count"``).
        bin_interval: Width of each regular bin. Must be positive.
        bin_threshold: Values strictly greater than this go to the overflow
            bin.

    Returns:
        DataFrame indexed by the bin label (index name ``"bins"``), sorted by
        ascending bin edge, with a single count column.

    Raises:
        ValueError: If ``bin_interval`` is not positive, or if ``bin_col``
            contains NaN (a NaN cannot be floored to a bin edge).
    """
    import math  # local import: only needed here for flooring to a bin edge

    if bin_interval <= 0:
        raise ValueError("bin_interval must be positive, got %r" % (bin_interval,))

    # Upper edge used to identify the overflow bin.
    overflow_edge = bin_threshold + bin_interval

    def _upper_edge(x):
        # Test the threshold first so huge/infinite values never reach floor().
        if x > bin_threshold:
            return overflow_edge
        # floor (not int truncation) so negative values land in the bin below
        # them instead of being pulled towards zero.
        return bin_interval * (math.floor(x / bin_interval) + 1)

    # Group on a separate Series so the caller's frame gets no helper column.
    upper_edges = df[bin_col].apply(_upper_edge).rename("bins")

    # Creating bin count based on count criteria
    if count_col is not None:
        summary = (
            df[count_col]
            .groupby(upper_edges)
            .nunique()
            .sort_index()
            .reset_index(name='%s_Count' % count_col)
        )
    else:
        summary = (
            upper_edges
            .groupby(upper_edges)
            .size()
            .sort_index()
            .reset_index(name='Count')
        )

    # Generating "start-end" interval labels from the numeric upper edges
    uppers = summary['bins']
    labels = (uppers - bin_interval).astype(str) + '-' + uppers.astype(str)
    # Only the real overflow bin gets the "threshold+" label; when nothing
    # reaches it, the last regular bin keeps its interval label.
    labels[uppers == overflow_edge] = "%.2f+" % (bin_threshold)
    summary['bins'] = labels
    return summary.set_index('bins')