Commit 0140de5

Merge pull request #131 from molgenis/feat/update_libraries
feat: Updated libraries
marikaris authored Sep 26, 2022
2 parents 6e180a6 + 6594a1a commit 0140de5
Showing 6 changed files with 48 additions and 21 deletions.
37 changes: 24 additions & 13 deletions scripts/balance_dataset.py
@@ -209,21 +209,22 @@ def split(dataset: pd.DataFrame):
     all_benign._is_copy = None
     v_benign_samples = all_benign.sample(frac=0.1, random_state=__random_state__)
     # A bit cryptic to remove the random samples from the benign dataset, but it works
-    all_benign = all_benign.append(v_benign_samples)
+    all_benign = pd.concat([all_benign, v_benign_samples], axis=0, ignore_index=True)
     all_benign.drop_duplicates(keep=False, inplace=True)
-    return_dataset = return_dataset.append(all_benign, ignore_index=True)
-    validation_dataset = validation_dataset.append(v_benign_samples, ignore_index=True)
+    return_dataset = pd.concat([return_dataset, all_benign], axis=0, ignore_index=True)
+    validation_dataset = pd.concat([validation_dataset, v_benign_samples], axis=0,
+                                   ignore_index=True)

     # Pathogenic
     all_pathogenic = dataset[dataset['binarized_label'] == 1]
     all_pathogenic._is_copy = None
     v_patho_samples = all_pathogenic.sample(frac=0.1, random_state=__random_state__)
     # Again a cryptic way to remove the randomly sampled pathogenic samples
-    all_pathogenic = all_pathogenic.append(v_patho_samples)
+    all_pathogenic = pd.concat([all_pathogenic, v_patho_samples], axis=0, ignore_index=True)
     all_pathogenic.drop_duplicates(keep=False, inplace=True)
-    return_dataset = return_dataset.append(all_pathogenic, ignore_index=True)
-    validation_dataset = validation_dataset.append(v_patho_samples, ignore_index=True)
-
+    return_dataset = pd.concat([return_dataset, all_pathogenic], axis=0, ignore_index=True)
+    validation_dataset = pd.concat([validation_dataset, v_patho_samples], axis=0,
+                                   ignore_index=True)
     return validation_dataset, return_dataset


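A note on the idiom above: concatenating the full frame with its own random sample duplicates every sampled row, and drop_duplicates(keep=False) then discards both copies, so only the unsampled rows survive. A minimal standalone sketch of the trick, assuming the frame has no pre-existing duplicate rows (those would be dropped as well):

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3, 4]})
sampled = df.sample(frac=0.5, random_state=5)

# Each sampled row now occurs twice in the concatenation;
# keep=False removes every occurrence of a duplicated row.
remainder = pd.concat([df, sampled], axis=0, ignore_index=True)
remainder.drop_duplicates(keep=False, inplace=True)
print(remainder)  # only the rows that were not sampled
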
@@ -247,7 +248,12 @@ def balance(self, dataset: pd.DataFrame):
             processed_consequence = self._process_consequence(
                 pathogenic_dataset=selected_pathogenic, benign_dataset=selected_benign
             )
-            return_dataset = return_dataset.append(processed_consequence)
+            return_dataset = pd.concat(
+                [
+                    return_dataset,
+                    processed_consequence
+                ], axis=0, ignore_index=True
+            )
         return return_dataset

     def _process_consequence(self, pathogenic_dataset, benign_dataset):
@@ -267,10 +273,13 @@ def _process_consequence(self, pathogenic_dataset, benign_dataset):
             lower_bound = bins[ind]
             upper_bound = bins[ind + 1]
             sample_number = pathogenic_histogram[ind]
-            processed_bins = processed_bins.append(
-                self._process_bins(
-                    pathogenic_dataset, benign_dataset, upper_bound, lower_bound, sample_number
-                )
+            processed_bins = pd.concat(
+                [
+                    processed_bins,
+                    self._process_bins(
+                        pathogenic_dataset, benign_dataset, upper_bound, lower_bound, sample_number
+                    )
+                ], axis=0, ignore_index=True
             )
         return processed_bins

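Side note on the two hunks above: pd.concat copies its inputs, so growing a frame by one piece per loop iteration is effectively quadratic in the total row count. The commit deliberately keeps the original one-frame-at-a-time structure; a common alternative, sketched here as a hypothetical rewrite (not part of this change), is to collect the pieces and concatenate once:

import pandas as pd

# Hypothetical rewrite; process_bin stands in for the per-bin
# self._process_bins(...) call in the loop above.
parts = [process_bin(ind) for ind in range(len(bins) - 1)]
processed_bins = pd.concat(parts, axis=0, ignore_index=True)
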
@@ -295,7 +304,9 @@ def _process_bins(
             selected_benign.shape[0],
             random_state=__random_state__
         )
-        return return_benign.append(return_pathogenic, ignore_index=True)
+        return pd.concat(
+            [return_benign, return_pathogenic], axis=0, ignore_index=True
+        )

     @staticmethod
     def _get_variants_within_range(dataset, upper_bound, lower_bound):
10 changes: 5 additions & 5 deletions setup.py
@@ -29,11 +29,11 @@
     ],
     python_requires='>=3.8',
     install_requires=[
-        'numpy==1.22.0',
-        'pandas==1.3.5',
-        'scipy==1.7.3',
-        'scikit-learn==1.0.2',
-        'xgboost==1.4.2'
+        'numpy==1.23.2',
+        'pandas==1.4.4',
+        'scipy==1.9.1',
+        'scikit-learn==1.1.2',
+        'xgboost==1.6.2'
     ],
     extras_require={
         'testing': [
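The pins explain the code changes: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, which forces the pd.concat rewrites above, and xgboost 1.6 deprecated passing early_stopping_rounds and eval_metric to fit(). A quick sketch of the pandas side under the pinned versions:

import pandas as pd  # pandas==1.4.4, per the pin above

df = pd.DataFrame({'x': [1]})
df.append(df)  # FutureWarning on pandas 1.4.x; AttributeError on pandas >= 2.0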
2 changes: 1 addition & 1 deletion src/molgenis/capice/__init__.py
@@ -1 +1 @@
-__version__ = '4.0.0-rc1'
+__version__ = '4.0.0-rc2'
8 changes: 6 additions & 2 deletions src/molgenis/capice/main_train.py
@@ -179,6 +179,12 @@ def train(self, test_set: pd.DataFrame, train_set: pd.DataFrame):
             random_state=self.model_random_state,
             use_label_encoder=False
         )
+        model_estimator.set_params(
+            **{
+                'eval_metric': ["auc"],
+                'early_stopping_rounds': self.esr
+            }
+        )
         randomised_search_cv = RandomizedSearchCV(estimator=model_estimator,
                                                   param_distributions=param_dist,
                                                   scoring='roc_auc', n_jobs=8,
@@ -191,8 +197,6 @@ def train(self, test_set: pd.DataFrame, train_set: pd.DataFrame):
         self.log.info('Random search starting, please hold.')
         randomised_search_cv.fit(train_set[self.processed_features],
                                  train_set[TrainEnums.binarized_label.value],
-                                 early_stopping_rounds=self.esr,
-                                 eval_metric=["auc"],
                                  eval_set=eval_set,
                                  verbose=xgb_verbosity,
                                  sample_weight=train_set[TrainEnums.sample_weight.value])
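These two hunks are one migration: from xgboost 1.6 on, eval_metric and early_stopping_rounds belong on the estimator rather than in fit(), where they are deprecated. A self-contained sketch of the new pattern on synthetic data (the array names and sizes are illustrative, not from this repository):

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = rng.integers(0, 2, size=200)

model = xgb.XGBClassifier(
    eval_metric=['auc'],         # was: fit(..., eval_metric=["auc"])
    early_stopping_rounds=15,    # was: fit(..., early_stopping_rounds=self.esr)
)
model.fit(X[:150], y[:150], eval_set=[(X[150:], y[150:])], verbose=False)

Configuring them through set_params(), as the commit does, is equivalent to passing them to the constructor, and because they show up in get_params() the settings survive the estimator cloning that RandomizedSearchCV performs.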
12 changes: 12 additions & 0 deletions tests/capice/test_main_train.py
@@ -55,6 +55,18 @@ def test_integration_training(self):
         best_model = str(model.__class__).split("'")[1]
         self.assertEqual('xgboost.sklearn.XGBClassifier', best_model)

+    def test_params(self):
+        """
+        Test to see if the XGBoost >=1.6 parameter settings are applied correctly to the model.
+        """
+        print('Test params')
+        self.main.run()
+        output_path = os.path.join(self.output_dir, self.output_filename)
+        with open(output_path, 'rb') as model_dat:
+            model = pickle.load(model_dat)
+        self.assertEqual(model.get_params()['early_stopping_rounds'], 1)
+        self.assertEqual(model.get_params()['eval_metric'], ['auc'])
+
     def test_unit_split(self):
         """
         Unit test to see if split works.
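The new test relies on estimator-level parameters surviving the pickle round trip. A standalone sketch of that mechanism, independent of the repository's fixtures:

import pickle
import xgboost as xgb

clf = xgb.XGBClassifier()
clf.set_params(**{'eval_metric': ['auc'], 'early_stopping_rounds': 1})
restored = pickle.loads(pickle.dumps(clf))
assert restored.get_params()['early_stopping_rounds'] == 1
assert restored.get_params()['eval_metric'] == ['auc']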
Binary file modified tests/resources/xgb_booster_poc.pickle.dat
