diff --git a/fe_util.py b/fe_util.py
index f498df9..8c5ce7b 100644
--- a/fe_util.py
+++ b/fe_util.py
@@ -99,10 +99,10 @@ def crosscount(df, col_list):
 
 
 def aggregate(df, num_col, col, stat_list = AGGREGATE_TYPE):
-    agg_dict = {}
+    agg_list = []
     for i in stat_list:
-        agg_dict['AGG_{}_{}_{}'.format(i, num_col, col)] = i
-    agg_result = df.groupby([col])[num_col].agg(agg_dict)
+        agg_list.append(('AGG_{}_{}_{}'.format(i, num_col, col), i))
+    agg_result = df.groupby([col])[num_col].agg(agg_list)
     r = left_merge(df, agg_result, on = [col])
     df = concat([df, r])
     return df
@@ -112,9 +112,9 @@
     """
     get id group_by(id) nunique
     """
-    agg_dict = {}
-    agg_dict['NUNIQUE_{}_{}'.format(id_col, col)] = 'nunique'
-    agg_result = df.groupby([col])[id_col].agg(agg_dict)
+    agg_list = []
+    agg_list.append(('NUNIQUE_{}_{}'.format(id_col, col), 'nunique'))
+    agg_result = df.groupby([col])[id_col].agg(agg_list)
     r = left_merge(df, agg_result, on = [col])
     df = concat([df, r])
     return df
@@ -124,11 +124,11 @@
     """
     get id group_by(id) histgram statitics
     """
-    agg_dict = {}
+    agg_list = []
     for i in stat_list:
-        agg_dict['HISTSTAT_{}_{}_{}'.format(i, id_col, col)] = i
+        agg_list.append(('HISTSTAT_{}_{}_{}'.format(i, id_col, col), i))
     df['temp_count'] = df.groupby(id_col)[id_col].transform('count')
-    agg_result = df.groupby([col])['temp_count'].agg(agg_dict)
+    agg_result = df.groupby([col])['temp_count'].agg(agg_list)
     r = left_merge(df, agg_result, on = [col])
     df = concat([df, r])
     del df['temp_count']
diff --git a/main.py b/main.py
index ff11691..b13c549 100644
--- a/main.py
+++ b/main.py
@@ -29,6 +29,7 @@
     file_name = 'train.tiny.csv'
     target_name = 'Label'
     id_index = 'Id'
+    min_data = 200  # must be smaller than the number of training instances
 
     # get parameters from tuner
     RECEIVED_PARAMS = nni.get_next_parameter()
@@ -43,7 +44,8 @@
 
     # raw feaure + sample_feature
    df = name2feature(df, sample_col, target_name)
-    feature_imp, val_score = lgb_model_train(df, _epoch = 1000, target_name = target_name, id_index = id_index)
+    feature_imp, val_score = lgb_model_train(df, _epoch=1000, target_name=target_name,
+                                             id_index=id_index, min_data=min_data)
     nni.report_final_result({
         "default":val_score,
         "feature_importance":feature_imp
diff --git a/model.py b/model.py
index c827848..d26ffc0 100644
--- a/model.py
+++ b/model.py
@@ -51,7 +51,7 @@ def train_test_split(X, y, test_size, random_state=2018):
     return [X_train, X_test, y_train, y_test]
 
 
-def lgb_model_train( df, _epoch=1000, target_name='Label', id_index='Id'):
+def lgb_model_train( df, _epoch=1000, target_name='Label', id_index='Id', min_data=200):
     df = df.loc[df[target_name].isnull()==False]
     feature_name = [i for i in df.columns if i not in [target_name, id_index]]
     for i in feature_name:
@@ -68,7 +68,7 @@ def lgb_model_train( df, _epoch=1000, target_name='Label', id_index='Id'):
         'num_threads': 4,
         'num_leaves':64,
         'learning_rate': 0.05,
-        'min_data': 200,
+        'min_data': min_data,
         'bagging_fraction': 0.5,
         'feature_fraction': 0.5,
         'max_depth': -1 ,
diff --git a/requirments.txt b/requirements.txt
similarity index 62%
rename from requirments.txt
rename to requirements.txt
index e02fd2e..1adca56 100644
--- a/requirments.txt
+++ b/requirements.txt
@@ -2,5 +2,5 @@ lightgbm
 pandas
 numpy
 sklearn
-nni==0.9.1
-gensim
+nni
+gensim
\ No newline at end of file
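
Note on the fe_util.py hunks: passing a dict of {output_name: function} to a
SeriesGroupBy.agg() call ("dict renaming") was deprecated in pandas 0.20 and
removed in pandas 1.0, where it raises SpecificationError ("nested renamer is
not supported"); a list of (output_name, function) tuples is the drop-in
replacement used above. A minimal standalone sketch of the two styles, with a
toy frame and illustrative column names that are not from this repo:

    import pandas as pd

    df = pd.DataFrame({'col': ['a', 'a', 'b'], 'num': [1.0, 2.0, 5.0]})

    # Old style: removed in pandas 1.0, raises SpecificationError.
    # df.groupby(['col'])['num'].agg({'AGG_mean_num_col': 'mean'})

    # New style, as in the patch: list of (output_name, function) tuples.
    agg_list = [('AGG_mean_num_col', 'mean'), ('AGG_max_num_col', 'max')]
    agg_result = df.groupby(['col'])['num'].agg(agg_list)
    print(agg_result)  # one row per group, one column per tuple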
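
Note on the min_data change: 'min_data' is LightGBM's alias for
min_data_in_leaf, and a value close to or above the number of training rows
leaves LightGBM unable to make any split, so a hard-coded 200 can break runs
on very small inputs; threading it through lgb_model_train() makes the example
usable on such data. A minimal sketch of the interaction on synthetic data
(all names below are illustrative, not from this repo):

    import numpy as np
    import lightgbm as lgb

    # Toy binary-classification data; 100 rows stands in for a small dataset.
    rng = np.random.default_rng(0)
    X = rng.random((100, 5))
    y = rng.integers(0, 2, 100)

    # Keep min_data well below the row count so splits remain possible.
    params = {'objective': 'binary', 'min_data': 20, 'verbose': -1}
    booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=10)
    print(booster.num_trees())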