Merge pull request #130 from molgenis/feat/no_imputing

feat: No Imputing
molgenis · Sep 23, 2022 · 6e180a6 · 6e180a6
2 parents 2019bce + 510f182
commit 6e180a6
Show file tree

Hide file tree

Showing 26 changed files with 245 additions and 445 deletions.
diff --git a/README.md b/README.md
@@ -120,15 +120,15 @@ _For instance:_
 The following argument is specific to `predict`:
 
 - -m / --model **(required)**: The path to a custom pickled CAPICE model that includes
-  attributes `CAPICE_version` (`str`) and `impute_values` (`dict`). Models can be found as attachments on the [GitHub releases](https://github.com/molgenis/capice/releases) page.
+  attributes `CAPICE_version` (`str`) and `model_features` (`list`). Models can be found as attachments on the [GitHub releases](https://github.com/molgenis/capice/releases) page.
 
 The following arguments are specific to `train`:
 
-- -m / --impute **(required)**: The path to a JSON containing the impute values with the column name as key and the
-  impute value as value.
+- -m / --impute **(required)**: The path to a JSON file containing the features desired for training. Each key is a training feature; each value is ignored and can be left `null`.
   **Please note that CAPICE is value type specific!**
 - -s / --split _(optional)_: Percentage of input data that should be used to measure performance during training.
   Argument should be given in float from 0.1 (10%) to 0.9 (90%), default = 0.2.
+- -t / --threads _(optional)_: The number of processing cores the training protocol can use. Default = 1.
 
 You can also use `capice {module} --help` to show help on the command line.
 
@@ -164,7 +164,7 @@ A file will be put out containing the following element:
 
 - `xgb_classifier`: Custom [Pickled](https://docs.python.org/3/library/pickle.html) instance of
   a [XGBClassifier](https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier) instance that
-  has successfully trained on the input data, containing additional attributes CAPICE_version and impute_values.
+  has successfully trained on the input data, containing additional attributes CAPICE_version and model_features.
 
 _Note: To load in a pickled instance of a model, use the following commands:_
 

diff --git a/resources/train_features.json b/resources/train_features.json
@@ -0,0 +1,62 @@
+{
+    "PolyPhenCat": null,
+    "PolyPhenVal": null,
+    "cDNApos": null,
+    "relcDNApos": null,
+    "SIFTcat": null,
+    "SIFTval": null,
+    "protPos": null,
+    "relProtPos": null,
+    "oAA": null,
+    "nAA": null,
+    "CDSpos": null,
+    "relCDSpos": null,
+    "ref": null,
+    "alt": null,
+    "is_regulatory_region_variant": null,
+    "is_regulatory_region_ablation": null,
+    "is_regulatory_region_amplification": null,
+    "is_missense_variant": null,
+    "is_intron_variant": null,
+    "is_upstream_gene_variant": null,
+    "is_downstream_gene_variant": null,
+    "is_synonymous_variant": null,
+    "is_TF_binding_site_variant": null,
+    "is_splice_donor_variant": null,
+    "is_coding_sequence_variant": null,
+    "is_splice_region_variant": null,
+    "is_stop_gained": null,
+    "is_splice_acceptor_variant": null,
+    "is_frameshift_variant": null,
+    "is_3_prime_UTR_variant": null,
+    "is_inframe_insertion": null,
+    "is_inframe_deletion": null,
+    "is_5_prime_UTR_variant": null,
+    "is_start_lost": null,
+    "is_non_coding_transcript_exon_variant": null,
+    "is_non_coding_transcript_variant": null,
+    "is_TFBS_ablation": null,
+    "is_TFBS_amplification": null,
+    "is_protein_altering_variant": null,
+    "is_stop_lost": null,
+    "is_stop_retained_variant": null,
+    "is_transcript_ablation": null,
+    "is_intergenic_variant": null,
+    "is_start_retained_variant": null,
+    "is_transcript_amplification": null,
+    "is_incomplete_terminal_codon_variant": null,
+    "is_mature_miRNA_variant": null,
+    "is_NMD_transcript_variant": null,
+    "is_feature_elongation": null,
+    "is_feature_truncation": null,
+    "SpliceAI_pred_DP_AG": null,
+    "SpliceAI_pred_DP_AL": null,
+    "SpliceAI_pred_DP_DG": null,
+    "SpliceAI_pred_DP_DL": null,
+    "SpliceAI_pred_DS_AG": null,
+    "SpliceAI_pred_DS_AL": null,
+    "SpliceAI_pred_DS_DG": null,
+    "SpliceAI_pred_DS_DL": null,
+    "Type": null,
+    "Length": null
+}
diff --git a/resources/train_impute_values.json b/resources/train_impute_values.json
diff --git a/src/molgenis/capice/__init__.py b/src/molgenis/capice/__init__.py
@@ -1 +1 @@
-__version__ = '3.3.0'
+__version__ = '4.0.0-rc1'
diff --git a/src/molgenis/capice/cli/args_handler_train.py b/src/molgenis/capice/cli/args_handler_train.py
@@ -17,7 +17,7 @@ def __init__(self, parser):
 
     @property
     def _extension(self):
-        return '.tsv.gz', '.tsv'
+        return '.tsv.gz'
 
     @property
     def _required_output_extensions(self):
@@ -42,7 +42,7 @@ def create(self):
             action='append',
             type=str,
             required=True,
-            help='path to impute values file (.json) (required)'
+            help='path to the json containing the features that can be used in training (required)'
         )
         self.parser.add_argument(
             '-s',

diff --git a/src/molgenis/capice/core/capice_exporter.py b/src/molgenis/capice/core/capice_exporter.py
@@ -66,4 +66,4 @@ def export_capice_model(self, model):
         with open(export_path, 'wb') as model_dump:
             pickle.dump(model, model_dump)
         if not self.output_given:
-            print('Successfully exported CAPICE model to: %s', export_path)
+            print('Successfully exported CAPICE model to: ', export_path)
diff --git a/src/molgenis/capice/main_capice.py b/src/molgenis/capice/main_capice.py
@@ -5,7 +5,6 @@
 from molgenis.capice.utilities.input_parser import InputParser
 from molgenis.capice.core.capice_exporter import CapiceExporter
 from molgenis.capice.utilities.preprocessor import PreProcessor
-from molgenis.capice.utilities.capice_imputing import CapiceImputing
 from molgenis.capice.utilities.manual_vep_processor import ManualVEPProcessor
 from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor
 from molgenis.capice.validators.post_file_parse_validator import PostFileParseValidator
@@ -74,16 +73,6 @@ def process(loaded_data):
         processed_data = processor.process(dataset=loaded_data)
         return processed_data
 
-    @staticmethod
-    def impute(loaded_data, impute_values):
-        """
-        Function to perform imputing over the loaded data.
-        self.model can be None, but impute_json has to be defined in that case.
-        """
-        capice_imputer = CapiceImputing(impute_values=impute_values)
-        capice_data = capice_imputer.impute(loaded_data)
-        return capice_data
-
     def preprocess(self, loaded_data, model_features=None):
         """
         Function to perform the preprocessing of the loaded data to convert

diff --git a/src/molgenis/capice/main_predict.py b/src/molgenis/capice/main_predict.py
@@ -27,7 +27,6 @@ def run(self):
                                                                     Column.feature.value,
                                                                     Column.feature_type.value])
         capice_data = self.process(loaded_data=capice_data)
-        capice_data = self.impute(loaded_data=capice_data, impute_values=self.model.impute_values)
         capice_data = self.preprocess(loaded_data=capice_data,
                                       model_features=self.model.get_booster().feature_names)
         capice_data = self.predict(loaded_data=capice_data)

diff --git a/src/molgenis/capice/main_train.py b/src/molgenis/capice/main_train.py
@@ -59,13 +59,12 @@ def run(self):
             json_dict = json.load(impute_values_file)
         self._validate_impute_complete(data, json_dict)
 
-        imputed_data = self.impute(loaded_data=data, impute_values=json_dict)
-        processed_data = self.preprocess(loaded_data=imputed_data)
+        processed_data = self.preprocess(loaded_data=data)
         self._get_processed_features(dataset=processed_data, impute_keys=json_dict.keys())
         processed_train, processed_test = self.split_data(dataset=processed_data,
                                                           test_size=self.train_test_size)
         model = self.train(test_set=processed_test, train_set=processed_train)
-        setattr(model, "impute_values", json_dict)
+        setattr(model, "model_features", list(json_dict.keys()))
         setattr(model, 'CAPICE_version', __version__)
         self.exporter.export_capice_model(model=model)
 

diff --git a/src/molgenis/capice/utilities/capice_imputing.py b/src/molgenis/capice/utilities/capice_imputing.py
diff --git a/src/molgenis/capice/validators/model_validator.py b/src/molgenis/capice/validators/model_validator.py
@@ -16,7 +16,7 @@ def validate_has_required_attributes(model):
         Function to validate if the required attributes CAPICE_version,
         model_features and predict_proba are present.
         """
-        required_attributes = ['CAPICE_version', 'impute_values', 'predict_proba']
+        required_attributes = ['CAPICE_version', 'model_features', 'predict_proba']
         for attribute in required_attributes:
             if attribute not in dir(model):
                 raise AttributeError(f'Unable to locate attribute {attribute} in model file!')
diff --git a/src/molgenis/capice/validators/post_vep_processing_validator.py b/src/molgenis/capice/validators/post_vep_processing_validator.py
@@ -13,7 +13,7 @@ def validate_features_present(self, datafile):
         presently processed.
         """
         column_utils = ColumnUtils()
-        column_utils.set_specified_columns(self.model.impute_values.keys())
+        column_utils.set_specified_columns(self.model.model_features)
         features_not_present = column_utils.get_missing_diff_with(datafile.columns)
         if len(features_not_present) > 0:
             error_message = 'Detected required feature(s) %s not ' \

diff --git a/src/molgenis/capice/vep/length.py b/src/molgenis/capice/vep/length.py
@@ -8,7 +8,7 @@ class Length(Template):
     def __init__(self):
         super(Length, self).__init__(
             name=Column.ref.value,
-            usable=False
+            usable=True
         )
 
     @property

diff --git a/src/molgenis/capice/vep/type.py b/src/molgenis/capice/vep/type.py
@@ -8,7 +8,7 @@ class Type(Template):
     def __init__(self):
         super(Type, self).__init__(
             name=Column.ref.value,
-            usable=False
+            usable=True
         )
 
     @property