From d78e857ae0cc357f6e0251c3287cdbd5091205a8 Mon Sep 17 00:00:00 2001 From: Garen Wang Date: Wed, 27 Jan 2021 23:21:17 +0800 Subject: [PATCH 1/8] bug fixed when aggregating --- fe_util.py | 8 ++++---- requirments.txt | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fe_util.py b/fe_util.py index f498df9..1ba1f1e 100644 --- a/fe_util.py +++ b/fe_util.py @@ -99,11 +99,11 @@ def crosscount(df, col_list): def aggregate(df, num_col, col, stat_list = AGGREGATE_TYPE): - agg_dict = {} + agg_list = [] for i in stat_list: - agg_dict['AGG_{}_{}_{}'.format(i, num_col, col)] = i - agg_result = df.groupby([col])[num_col].agg(agg_dict) - r = left_merge(df, agg_result, on = [col]) + agg_list.append(('AGG_{}_{}_{}'.format(i, num_col, col), i)) + agg_result = df.groupby([col])[num_col].agg(agg_list) + r = left_merge(df, agg_result, on=[col]) df = concat([df, r]) return df diff --git a/requirments.txt b/requirments.txt index e02fd2e..e032dc2 100644 --- a/requirments.txt +++ b/requirments.txt @@ -2,5 +2,5 @@ lightgbm pandas numpy sklearn -nni==0.9.1 +nni gensim From 2215a7c8624218036b15f0eb43c33302e447e13f Mon Sep 17 00:00:00 2001 From: Garen Wang Date: Thu, 28 Jan 2021 11:15:39 +0800 Subject: [PATCH 2/8] add .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a47fb1d --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea +__pycache__ \ No newline at end of file From bc2e42c301f4362aef80b71dd854d0872e1b0829 Mon Sep 17 00:00:00 2001 From: Garen Wang Date: Thu, 28 Jan 2021 16:59:42 +0800 Subject: [PATCH 3/8] same bug fixed in function nunique & histstat --- fe_util.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fe_util.py b/fe_util.py index 1ba1f1e..f3ba5f6 100644 --- a/fe_util.py +++ b/fe_util.py @@ -112,9 +112,9 @@ def nunique(df, id_col, col): """ get id group_by(id) nunique """ - agg_dict = {} - agg_dict['NUNIQUE_{}_{}'.format(id_col, col)] = 'nunique' - agg_result = df.groupby([col])[id_col].agg(agg_dict) + agg_list = [] + agg_list.append(('NUNIQUE_{}_{}'.format(id_col, col), 'nunique')) + agg_result = df.groupby([col])[id_col].agg(agg_list) r = left_merge(df, agg_result, on = [col]) df = concat([df, r]) return df @@ -124,12 +124,12 @@ def histstat(df, id_col, col, stat_list = AGGREGATE_TYPE): """ get id group_by(id) histgram statitics """ - agg_dict = {} + agg_list = [] for i in stat_list: - agg_dict['HISTSTAT_{}_{}_{}'.format(i, id_col, col)] = i + agg_list.append(('HISTSTAT_{}_{}_{}'.format(i, id_col, col), i)) df['temp_count'] = df.groupby(id_col)[id_col].transform('count') - agg_result = df.groupby([col])['temp_count'].agg(agg_dict) - r = left_merge(df, agg_result, on = [col]) + agg_result = df.groupby([col])['temp_count'].agg(agg_list) + r = left_merge(df, agg_result, on=[col]) df = concat([df, r]) del df['temp_count'] return df From 07e0865923ac2ff55ac7ddc178900aa8fcd1aa1e Mon Sep 17 00:00:00 2001 From: Garen Wang Date: Thu, 28 Jan 2021 17:05:09 +0800 Subject: [PATCH 4/8] delete .gitignore --- .gitignore | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index a47fb1d..0000000 --- a/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -.idea -__pycache__ \ No newline at end of file From ac06fa77a00a1b7af2e62be73ef29805d9047f96 Mon Sep 17 00:00:00 2001 From: Garen Wang Date: Thu, 28 Jan 2021 17:08:28 +0800 Subject: [PATCH 5/8] maintain original format --- fe_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fe_util.py b/fe_util.py index f3ba5f6..8c5ce7b 100644 --- a/fe_util.py +++ b/fe_util.py @@ -103,7 +103,7 @@ def aggregate(df, num_col, col, stat_list = AGGREGATE_TYPE): for i in stat_list: agg_list.append(('AGG_{}_{}_{}'.format(i, num_col, col), i)) agg_result = df.groupby([col])[num_col].agg(agg_list) - r = left_merge(df, agg_result, on=[col]) + r = left_merge(df, agg_result, on = [col]) df = concat([df, r]) return df @@ -129,7 +129,7 @@ def histstat(df, id_col, col, stat_list = AGGREGATE_TYPE): agg_list.append(('HISTSTAT_{}_{}_{}'.format(i, id_col, col), i)) df['temp_count'] = df.groupby(id_col)[id_col].transform('count') agg_result = df.groupby([col])['temp_count'].agg(agg_list) - r = left_merge(df, agg_result, on=[col]) + r = left_merge(df, agg_result, on = [col]) df = concat([df, r]) del df['temp_count'] return df From 979b640dd7b407c7be672f7a51332e53adde8c82 Mon Sep 17 00:00:00 2001 From: Garen Wang Date: Fri, 29 Jan 2021 13:09:36 +0800 Subject: [PATCH 6/8] removed unused package name --- requirments.txt => requirements.txt | 1 - 1 file changed, 1 deletion(-) rename requirments.txt => requirements.txt (82%) diff --git a/requirments.txt b/requirements.txt similarity index 82% rename from requirments.txt rename to requirements.txt index e032dc2..de8c842 100644 --- a/requirments.txt +++ b/requirements.txt @@ -3,4 +3,3 @@ pandas numpy sklearn nni -gensim From df3e69a685976414388fa94bc90bb1222fa44792 Mon Sep 17 00:00:00 2001 From: Garen Wang Date: Fri, 29 Jan 2021 13:12:55 +0800 Subject: [PATCH 7/8] add parameter min_data to lgb_model_train --- main.py | 4 +++- model.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index ff11691..b13c549 100644 --- a/main.py +++ b/main.py @@ -29,6 +29,7 @@ file_name = 'train.tiny.csv' target_name = 'Label' id_index = 'Id' + min_data = 200 # must be a factor of number of instances # get parameters from tuner RECEIVED_PARAMS = nni.get_next_parameter() @@ -43,7 +44,8 @@ # raw feaure + sample_feature df = name2feature(df, sample_col, target_name) - feature_imp, val_score = lgb_model_train(df, _epoch = 1000, target_name = target_name, id_index = id_index) + feature_imp, val_score = lgb_model_train(df, _epoch=1000, target_name=target_name, + id_index=id_index, min_data=min_data) nni.report_final_result({ "default":val_score, "feature_importance":feature_imp diff --git a/model.py b/model.py index c827848..d26ffc0 100644 --- a/model.py +++ b/model.py @@ -51,7 +51,7 @@ def train_test_split(X, y, test_size, random_state=2018): return [X_train, X_test, y_train, y_test] -def lgb_model_train( df, _epoch=1000, target_name='Label', id_index='Id'): +def lgb_model_train( df, _epoch=1000, target_name='Label', id_index='Id', min_data=200): df = df.loc[df[target_name].isnull()==False] feature_name = [i for i in df.columns if i not in [target_name, id_index]] for i in feature_name: @@ -68,7 +68,7 @@ def lgb_model_train( df, _epoch=1000, target_name='Label', id_index='Id'): 'num_threads': 4, 'num_leaves':64, 'learning_rate': 0.05, - 'min_data': 200, + 'min_data': min_data, 'bagging_fraction': 0.5, 'feature_fraction': 0.5, 'max_depth': -1 , From 625c0ec72c74d7750c11a0765efece29eeb5df17 Mon Sep 17 00:00:00 2001 From: Garen Wang Date: Sun, 31 Jan 2021 13:29:03 +0800 Subject: [PATCH 8/8] requirements error fixed --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index de8c842..1adca56 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ pandas numpy sklearn nni +gensim \ No newline at end of file