Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Aggregation Bug Fixed & Support Provided for Example Datasets #12

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
18 changes: 9 additions & 9 deletions fe_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,10 @@ def crosscount(df, col_list):


def aggregate(df, num_col, col, stat_list = AGGREGATE_TYPE):
agg_dict = {}
agg_list = []
for i in stat_list:
agg_dict['AGG_{}_{}_{}'.format(i, num_col, col)] = i
agg_result = df.groupby([col])[num_col].agg(agg_dict)
agg_list.append(('AGG_{}_{}_{}'.format(i, num_col, col), i))
agg_result = df.groupby([col])[num_col].agg(agg_list)
r = left_merge(df, agg_result, on = [col])
df = concat([df, r])
return df
Expand All @@ -112,9 +112,9 @@ def nunique(df, id_col, col):
"""
get id group_by(id) nunique
"""
agg_dict = {}
agg_dict['NUNIQUE_{}_{}'.format(id_col, col)] = 'nunique'
agg_result = df.groupby([col])[id_col].agg(agg_dict)
agg_list = []
agg_list.append(('NUNIQUE_{}_{}'.format(id_col, col), 'nunique'))
agg_result = df.groupby([col])[id_col].agg(agg_list)
r = left_merge(df, agg_result, on = [col])
df = concat([df, r])
return df
Expand All @@ -124,11 +124,11 @@ def histstat(df, id_col, col, stat_list = AGGREGATE_TYPE):
"""
get id group_by(id) histogram statistics
"""
agg_dict = {}
agg_list = []
for i in stat_list:
agg_dict['HISTSTAT_{}_{}_{}'.format(i, id_col, col)] = i
agg_list.append(('HISTSTAT_{}_{}_{}'.format(i, id_col, col), i))
df['temp_count'] = df.groupby(id_col)[id_col].transform('count')
agg_result = df.groupby([col])['temp_count'].agg(agg_dict)
agg_result = df.groupby([col])['temp_count'].agg(agg_list)
r = left_merge(df, agg_result, on = [col])
df = concat([df, r])
del df['temp_count']
Expand Down
4 changes: 3 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
file_name = 'train.tiny.csv'
target_name = 'Label'
id_index = 'Id'
min_data = 200 # must be a factor of number of instances

# get parameters from tuner
RECEIVED_PARAMS = nni.get_next_parameter()
Expand All @@ -43,7 +44,8 @@

# raw feature + sample_feature
df = name2feature(df, sample_col, target_name)
feature_imp, val_score = lgb_model_train(df, _epoch = 1000, target_name = target_name, id_index = id_index)
feature_imp, val_score = lgb_model_train(df, _epoch=1000, target_name=target_name,
id_index=id_index, min_data=min_data)
nni.report_final_result({
"default":val_score,
"feature_importance":feature_imp
Expand Down
4 changes: 2 additions & 2 deletions model.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def train_test_split(X, y, test_size, random_state=2018):
return [X_train, X_test, y_train, y_test]


def lgb_model_train( df, _epoch=1000, target_name='Label', id_index='Id'):
def lgb_model_train( df, _epoch=1000, target_name='Label', id_index='Id', min_data=200):
df = df.loc[df[target_name].isnull()==False]
feature_name = [i for i in df.columns if i not in [target_name, id_index]]
for i in feature_name:
Expand All @@ -68,7 +68,7 @@ def lgb_model_train( df, _epoch=1000, target_name='Label', id_index='Id'):
'num_threads': 4,
'num_leaves':64,
'learning_rate': 0.05,
'min_data': 200,
'min_data': min_data,
'bagging_fraction': 0.5,
'feature_fraction': 0.5,
'max_depth': -1 ,
Expand Down
4 changes: 2 additions & 2 deletions requirments.txt → requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ lightgbm
pandas
numpy
sklearn
nni==0.9.1
gensim
nni
gensim