Skip to content

Commit

Permalink
Merge pull request #130 from molgenis/feat/no_imputing
Browse files Browse the repository at this point in the history
feat: No Imputing
  • Loading branch information
marikaris authored Sep 23, 2022
2 parents 2019bce + 510f182 commit 6e180a6
Show file tree
Hide file tree
Showing 26 changed files with 245 additions and 445 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,15 +120,15 @@ _For instance:_
The following argument is specific to `predict`:

- -m / --model **(required)**: The path to a custom pickled CAPICE model that includes
attributes `CAPICE_version` (`str`) and `impute_values` (`dict`). Models can be found as attachments on the [GitHub releases](https://github.com/molgenis/capice/releases) page.
attributes `CAPICE_version` (`str`) and `model_features` (`list`). Models can be found as attachments on the [GitHub releases](https://github.com/molgenis/capice/releases) page.

The following arguments are specific to `train`:

- -m / --impute **(required)**: The path to a JSON containing the impute values with the column name as key and the
impute value as value.
- -m / --impute **(required)**: The path to a JSON containing the features desired for training. Each key is a training feature; the values are ignored and can be left `null`.
**Please note that CAPICE is value type specific!**
- -s / --split _(optional)_: Percentage of input data that should be used to measure performance during training.
Argument should be given in float from 0.1 (10%) to 0.9 (90%), default = 0.2.
- -t / --threads _(optional)_: The number of processing cores the training protocol can use. Default = 1.

You can also use `capice {module} --help` to show help on the command line.

Expand Down Expand Up @@ -164,7 +164,7 @@ A file will be put out containing the following element:

- `xgb_classifier`: Custom [Pickled](https://docs.python.org/3/library/pickle.html) instance of
an [XGBClassifier](https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier) that
has successfully trained on the input data, containing additional attributes CAPICE_version and impute_values.
has successfully trained on the input data, containing additional attributes CAPICE_version and model_features.

_Note: To load in a pickled instance of a model, use the following commands:_

Expand Down
62 changes: 62 additions & 0 deletions resources/train_features.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"PolyPhenCat": null,
"PolyPhenVal": null,
"cDNApos": null,
"relcDNApos": null,
"SIFTcat": null,
"SIFTval": null,
"protPos": null,
"relProtPos": null,
"oAA": null,
"nAA": null,
"CDSpos": null,
"relCDSpos": null,
"ref": null,
"alt": null,
"is_regulatory_region_variant": null,
"is_regulatory_region_ablation": null,
"is_regulatory_region_amplification": null,
"is_missense_variant": null,
"is_intron_variant": null,
"is_upstream_gene_variant": null,
"is_downstream_gene_variant": null,
"is_synonymous_variant": null,
"is_TF_binding_site_variant": null,
"is_splice_donor_variant": null,
"is_coding_sequence_variant": null,
"is_splice_region_variant": null,
"is_stop_gained": null,
"is_splice_acceptor_variant": null,
"is_frameshift_variant": null,
"is_3_prime_UTR_variant": null,
"is_inframe_insertion": null,
"is_inframe_deletion": null,
"is_5_prime_UTR_variant": null,
"is_start_lost": null,
"is_non_coding_transcript_exon_variant": null,
"is_non_coding_transcript_variant": null,
"is_TFBS_ablation": null,
"is_TFBS_amplification": null,
"is_protein_altering_variant": null,
"is_stop_lost": null,
"is_stop_retained_variant": null,
"is_transcript_ablation": null,
"is_intergenic_variant": null,
"is_start_retained_variant": null,
"is_transcript_amplification": null,
"is_incomplete_terminal_codon_variant": null,
"is_mature_miRNA_variant": null,
"is_NMD_transcript_variant": null,
"is_feature_elongation": null,
"is_feature_truncation": null,
"SpliceAI_pred_DP_AG": null,
"SpliceAI_pred_DP_AL": null,
"SpliceAI_pred_DP_DG": null,
"SpliceAI_pred_DP_DL": null,
"SpliceAI_pred_DS_AG": null,
"SpliceAI_pred_DS_AL": null,
"SpliceAI_pred_DS_DG": null,
"SpliceAI_pred_DS_DL": null,
"Type": null,
"Length": null
}
60 changes: 0 additions & 60 deletions resources/train_impute_values.json

This file was deleted.

2 changes: 1 addition & 1 deletion src/molgenis/capice/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '3.3.0'
__version__ = '4.0.0-rc1'
4 changes: 2 additions & 2 deletions src/molgenis/capice/cli/args_handler_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(self, parser):

@property
def _extension(self):
return '.tsv.gz', '.tsv'
return '.tsv.gz'

@property
def _required_output_extensions(self):
Expand All @@ -42,7 +42,7 @@ def create(self):
action='append',
type=str,
required=True,
help='path to impute values file (.json) (required)'
help='path to the json containing the features that can be used in training (required)'
)
self.parser.add_argument(
'-s',
Expand Down
2 changes: 1 addition & 1 deletion src/molgenis/capice/core/capice_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,4 @@ def export_capice_model(self, model):
with open(export_path, 'wb') as model_dump:
pickle.dump(model, model_dump)
if not self.output_given:
print('Successfully exported CAPICE model to: %s', export_path)
print('Successfully exported CAPICE model to: ', export_path)
11 changes: 0 additions & 11 deletions src/molgenis/capice/main_capice.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from molgenis.capice.utilities.input_parser import InputParser
from molgenis.capice.core.capice_exporter import CapiceExporter
from molgenis.capice.utilities.preprocessor import PreProcessor
from molgenis.capice.utilities.capice_imputing import CapiceImputing
from molgenis.capice.utilities.manual_vep_processor import ManualVEPProcessor
from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor
from molgenis.capice.validators.post_file_parse_validator import PostFileParseValidator
Expand Down Expand Up @@ -74,16 +73,6 @@ def process(loaded_data):
processed_data = processor.process(dataset=loaded_data)
return processed_data

@staticmethod
def impute(loaded_data, impute_values):
"""
Function to perform imputing over the loaded data.
self.model can be None, but impute_json has to be defined in that case.
"""
capice_imputer = CapiceImputing(impute_values=impute_values)
capice_data = capice_imputer.impute(loaded_data)
return capice_data

def preprocess(self, loaded_data, model_features=None):
"""
Function to perform the preprocessing of the loaded data to convert
Expand Down
1 change: 0 additions & 1 deletion src/molgenis/capice/main_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def run(self):
Column.feature.value,
Column.feature_type.value])
capice_data = self.process(loaded_data=capice_data)
capice_data = self.impute(loaded_data=capice_data, impute_values=self.model.impute_values)
capice_data = self.preprocess(loaded_data=capice_data,
model_features=self.model.get_booster().feature_names)
capice_data = self.predict(loaded_data=capice_data)
Expand Down
5 changes: 2 additions & 3 deletions src/molgenis/capice/main_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,12 @@ def run(self):
json_dict = json.load(impute_values_file)
self._validate_impute_complete(data, json_dict)

imputed_data = self.impute(loaded_data=data, impute_values=json_dict)
processed_data = self.preprocess(loaded_data=imputed_data)
processed_data = self.preprocess(loaded_data=data)
self._get_processed_features(dataset=processed_data, impute_keys=json_dict.keys())
processed_train, processed_test = self.split_data(dataset=processed_data,
test_size=self.train_test_size)
model = self.train(test_set=processed_test, train_set=processed_train)
setattr(model, "impute_values", json_dict)
setattr(model, "model_features", list(json_dict.keys()))
setattr(model, 'CAPICE_version', __version__)
self.exporter.export_capice_model(model=model)

Expand Down
79 changes: 0 additions & 79 deletions src/molgenis/capice/utilities/capice_imputing.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/molgenis/capice/validators/model_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def validate_has_required_attributes(model):
Function to validate if the required attributes CAPICE_version,
model_features and predict_proba are present.
"""
required_attributes = ['CAPICE_version', 'impute_values', 'predict_proba']
required_attributes = ['CAPICE_version', 'model_features', 'predict_proba']
for attribute in required_attributes:
if attribute not in dir(model):
raise AttributeError(f'Unable to locate attribute {attribute} in model file!')
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def validate_features_present(self, datafile):
presently processed.
"""
column_utils = ColumnUtils()
column_utils.set_specified_columns(self.model.impute_values.keys())
column_utils.set_specified_columns(self.model.model_features)
features_not_present = column_utils.get_missing_diff_with(datafile.columns)
if len(features_not_present) > 0:
error_message = 'Detected required feature(s) %s not ' \
Expand Down
2 changes: 1 addition & 1 deletion src/molgenis/capice/vep/length.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class Length(Template):
def __init__(self):
super(Length, self).__init__(
name=Column.ref.value,
usable=False
usable=True
)

@property
Expand Down
2 changes: 1 addition & 1 deletion src/molgenis/capice/vep/type.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class Type(Template):
def __init__(self):
super(Type, self).__init__(
name=Column.ref.value,
usable=False
usable=True
)

@property
Expand Down
Loading

0 comments on commit 6e180a6

Please sign in to comment.