Merge pull request #153 from molgenis/fix/train_feature_json
Fix/train feature json
dennishendriksen authored Dec 21, 2022
2 parents a91aebe + 75dbec3 commit a8893fc
Showing 39 changed files with 971 additions and 617 deletions.
61 changes: 9 additions & 52 deletions resources/train_features.json
@@ -1,54 +1,13 @@
{
- "PolyPhenCat": null,
- "PolyPhenVal": null,
- "cDNApos": null,
- "relcDNApos": null,
- "SIFTcat": null,
- "SIFTval": null,
- "protPos": null,
- "relProtPos": null,
- "oAA": null,
- "nAA": null,
- "CDSpos": null,
- "relCDSpos": null,
- "ref": null,
- "alt": null,
- "is_regulatory_region_variant": null,
- "is_regulatory_region_ablation": null,
- "is_regulatory_region_amplification": null,
- "is_missense_variant": null,
- "is_intron_variant": null,
- "is_upstream_gene_variant": null,
- "is_downstream_gene_variant": null,
- "is_synonymous_variant": null,
- "is_TF_binding_site_variant": null,
- "is_splice_donor_variant": null,
- "is_coding_sequence_variant": null,
- "is_splice_region_variant": null,
- "is_stop_gained": null,
- "is_splice_acceptor_variant": null,
- "is_frameshift_variant": null,
- "is_3_prime_UTR_variant": null,
- "is_inframe_insertion": null,
- "is_inframe_deletion": null,
- "is_5_prime_UTR_variant": null,
- "is_start_lost": null,
- "is_non_coding_transcript_exon_variant": null,
- "is_non_coding_transcript_variant": null,
- "is_TFBS_ablation": null,
- "is_TFBS_amplification": null,
- "is_protein_altering_variant": null,
- "is_stop_lost": null,
- "is_stop_retained_variant": null,
- "is_transcript_ablation": null,
- "is_intergenic_variant": null,
- "is_start_retained_variant": null,
- "is_transcript_amplification": null,
- "is_incomplete_terminal_codon_variant": null,
- "is_mature_miRNA_variant": null,
- "is_NMD_transcript_variant": null,
- "is_feature_elongation": null,
- "is_feature_truncation": null,
+ "PolyPhen": null,
+ "SIFT": null,
+ "cDNA_position": null,
+ "CDS_position": null,
+ "Protein_position": null,
+ "Amino_acids": null,
+ "REF": null,
+ "ALT": null,
+ "Consequence": null,
"SpliceAI_pred_DP_AG": null,
"SpliceAI_pred_DP_AL": null,
"SpliceAI_pred_DP_DG": null,
@@ -57,8 +16,6 @@
"SpliceAI_pred_DS_AL": null,
"SpliceAI_pred_DS_DG": null,
"SpliceAI_pred_DS_DL": null,
- "Type": null,
- "Length": null,
"Grantham": null,
"phyloP": null
}
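The values stay null placeholders; only the key names carry information, and the keys now name raw VEP features instead of post-processed CAPICE features. A minimal sketch of how such a file can be consumed (the loading code below is illustrative, not the CAPICE trainer itself):

```python
import json

# Illustrative only: read the train-feature file and keep just the feature names.
with open('resources/train_features.json') as handle:
    train_features = list(json.load(handle).keys())

print(train_features)  # e.g. ['PolyPhen', 'SIFT', 'cDNA_position', ..., 'phyloP']
```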
1 change: 1 addition & 0 deletions src/molgenis/capice/cli/args_handler_parent.py
@@ -119,6 +119,7 @@ def _retrieve_argument_from_list(self,
return self._single_argument_retriever(arg, arg_name, has_default)
except IOError as e:
self.parser.error(e)
+ return None

@staticmethod
def _single_argument_retriever(arg: list | None,
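The only change here is the explicit `return None` after `self.parser.error(e)`. Because `parser.error` exits the program, the line is unreachable at runtime; presumably it is there to satisfy a "missing return statement" check. A standalone sketch of the same pattern (class, method, and argument names are illustrative, not CAPICE's):

```python
import argparse


class ArgsHandlerSketch:
    """Illustrative only: mirrors the parser.error / return None pattern."""

    def __init__(self) -> None:
        self.parser = argparse.ArgumentParser()

    def retrieve(self, arg: list | None) -> str | None:
        try:
            return self._single_argument_retriever(arg)
        except IOError as e:
            self.parser.error(str(e))
        # Unreachable at runtime (parser.error raises SystemExit), but the
        # explicit return keeps missing-return checks satisfied.
        return None

    @staticmethod
    def _single_argument_retriever(arg: list | None) -> str:
        if not arg:
            raise IOError('no argument supplied')
        return arg[0]


print(ArgsHandlerSketch().retrieve(['input.tsv']))  # input.tsv
```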
3 changes: 2 additions & 1 deletion src/molgenis/capice/cli/args_handler_predict.py
@@ -20,7 +20,8 @@ def _extension(self):

@property
def _model_extension(self) -> tuple[str]:
- return '.json', '.ubj'
+ # Ignore because the amount of values of tuple does not matter.
+ return '.json', '.ubj' # type: ignore

def _model_extension_str(self) -> str:
return self._join_extensions(self._model_extension)
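The `# type: ignore` is needed because the property is annotated as `tuple[str]`, which a type checker reads as a one-element tuple, while two extensions are returned. Widening the annotation would also work; a sketch of that alternative (not what this commit does):

```python
class ModelExtensionSketch:
    """Illustrative only: shows the tuple-length typing nuance."""

    @property
    def _model_extension(self) -> tuple[str, ...]:
        # tuple[str, ...] means "any number of str elements",
        # whereas tuple[str] means "exactly one str element".
        return '.json', '.ubj'


print(ModelExtensionSketch()._model_extension)  # ('.json', '.ubj')
```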
75 changes: 49 additions & 26 deletions src/molgenis/capice/main_capice.py
@@ -1,12 +1,15 @@
import os
from abc import ABC, abstractmethod

import pandas as pd

from molgenis.capice.core.logger import Logger
from molgenis.capice.utilities.enums import Column
from molgenis.capice.core.capice_manager import CapiceManager
from molgenis.capice.utilities.input_parser import InputParser
from molgenis.capice.core.capice_exporter import CapiceExporter
from molgenis.capice.utilities.preprocessor import PreProcessor
from molgenis.capice.utilities.manual_vep_processor import ManualVEPProcessor
from molgenis.capice.utilities.categorical_processor import CategoricalProcessor
from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor
from molgenis.capice.validators.post_file_parse_validator import PostFileParseValidator

@@ -45,7 +48,7 @@ def __init__(self, input_path, output_path, output_given):
def run(self):
pass

def _load_file(self, additional_required_features: list = None):
def _load_file(self, additional_required_features: list | None = None):
"""
Function to load the input TSV file into main
:return: pandas DataFrame
@@ -66,35 +69,55 @@ def _load_file(self, additional_required_features: list = None):
return input_file

@staticmethod
def process(loaded_data):
def process(loaded_data: pd.DataFrame, process_features: list[str]) -> tuple[
pd.DataFrame, dict[str, list[str]]
]:
# Returns might look funky, but Google pydoc does not support multiple return statements.
"""
Function to process the VEP features to CAPICE features.
Function to call the ManualVEPProcessor over loaded_data using the supplied
process_features list.
Args:
loaded_data:
The pandas dataframe over which the VEP features should be processed.
process_features:
List containing either all input features, possibly containing VEP features (in
the case of train) or already all input features that can be VEP processed (in
case of predict).
Returns:
tuple:
Tuple [0] containing: The output dataframe containing all VEP processed features
according to process_features. Depending on the property "drop" will drop the
feature present in process_features from the columns of the output dataframe.
Tuple [1] containing: The output dictionary containing the VEP feature (key)
and the derivative features that originate from said VEP feature (value).
The property "drop" is of no influence here.
"""
processor = ManualVEPProcessor()
processed_data = processor.process(dataset=loaded_data)
return processed_data
processed_data = processor.process(loaded_data, process_features)
processed_features = processor.get_feature_processes()
# No validation, since that is specific to predict.
# Also predict doesn't technically need processed_features, but within predict the first
# argument in the tuple can just be indexed.
# Still returning both is relevant, in case we want to validate the processed_features in
# the future for predict.
return processed_data, processed_features

def preprocess(self, loaded_data, model_features=None):
"""
Function to perform the preprocessing of the loaded data to convert
categorical columns.
:param loaded_data: Pandas dataframe of the imputed CAPICE data
:param model_features: list (default None), a list containing all
the features present within a model file. When set to None,
PreProcessor will activate the train protocol.
Note: please adjust self.exclude_features: to include all of the
features that the preprocessor should NOT process.
Features chr_pos_ref_alt, chr and pos are hardcoded and
thus do not have to be included.
"""
preprocessor = PreProcessor(
exclude_features=self.exclude_features,
model_features=model_features)
capice_data = preprocessor.preprocess(loaded_data)
return capice_data
@staticmethod
def categorical_process(loaded_data: pd.DataFrame,
processing_features: dict[str, list[str]] | None = None,
train_features: list | None = None):
processor = CategoricalProcessor()
capice_data, processed_features = processor.process(
loaded_data,
processable_features=train_features,
predetermined_features=processing_features
)
return capice_data, processed_features

def _export(self, dataset, output):
def _export(self, dataset: pd.DataFrame, output: os.PathLike):
"""
Function to prepare the data to be exported
"""
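To make the new contract of `process` concrete: per its docstring it now returns both the expanded DataFrame and a mapping from each source VEP feature to the columns derived from it. A self-contained toy illustration of that return shape (not the actual ManualVEPProcessor logic):

```python
import pandas as pd


def toy_process(loaded_data: pd.DataFrame,
                process_features: list[str]) -> tuple[pd.DataFrame, dict[str, list[str]]]:
    """Mimics the (DataFrame, feature-mapping) return shape of Main.process."""
    processed = loaded_data.copy()
    feature_map: dict[str, list[str]] = {}
    if 'Consequence' in process_features:
        # One derived flag per consequence term; a single flag is shown here.
        processed['is_missense_variant'] = (
            processed['Consequence'].eq('missense_variant').astype(int)
        )
        feature_map['Consequence'] = ['is_missense_variant']
    return processed, feature_map


data = pd.DataFrame({'Consequence': ['missense_variant', 'intron_variant']})
processed_data, processed_features = toy_process(data, ['Consequence'])
print(processed_features)  # {'Consequence': ['is_missense_variant']}
```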
29 changes: 16 additions & 13 deletions src/molgenis/capice/main_predict.py
@@ -2,13 +2,14 @@
from molgenis.capice.utilities.enums import Column
from molgenis.capice.utilities.predictor import Predictor
from molgenis.capice.utilities.class_suggestor import ClassSuggestor
from molgenis.capice.validators.predict_validator import PredictValidator
from molgenis.capice.validators.post_vep_processing_validator import PostVEPProcessingValidator


class CapicePredict(Main):
"""
Predict class of CAPICE to call the different modules to impute,
preprocess and eventually predict a score over a CAPICE annotated file.
process and eventually predict a score over a CAPICE annotated file.
"""

def __init__(self, input_path, model, output_path, output_given):
@@ -26,27 +27,29 @@ def run(self):
Column.id_source.value,
Column.feature.value,
Column.feature_type.value])
capice_data = self.process(loaded_data=capice_data)
capice_data = self.preprocess(loaded_data=capice_data,
model_features=self.model.get_booster().feature_names)
capice_data = self.process(
loaded_data=capice_data,
process_features=list(self.model.vep_features.keys())
)[0]
PostVEPProcessingValidator().validate_features_present(
capice_data, self.model.vep_features.values()
)
capice_data = self.categorical_process(
loaded_data=capice_data,
processing_features=self.model.processable_features,
train_features=None
)[0]
capice_data = self.predict(loaded_data=capice_data)
capice_data = self.apply_suggested_class(predicted_data=capice_data)
self._export(dataset=capice_data, output=self.output)

def process(self, loaded_data):
"""
Function to process the VEP file to a CAPICE file
"""
processed_data = super().process(loaded_data)
validator = PostVEPProcessingValidator(self.model)
validator.validate_features_present(processed_data)
return processed_data

def predict(self, loaded_data):
"""
Function to call the correct model to predict CAPICE scores
:return: pandas DataFrame
"""
validator = PredictValidator()
validator.validate_data_predict_ready(loaded_data, self.model)
predictor = Predictor(self.model)
capice_data = predictor.predict(loaded_data)
return capice_data
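`PostVEPProcessingValidator` is now instantiated without the model and receives the data plus `self.model.vep_features.values()` directly. Assuming `vep_features` maps each VEP feature to its derived column names, the validation amounts to a column-presence check; a toy sketch of such a check (an assumption, not the validator's actual code):

```python
from collections.abc import Iterable

import pandas as pd


def validate_features_present(datafile: pd.DataFrame,
                              vep_feature_groups: Iterable[list[str]]) -> None:
    """Toy check: every derived VEP column must exist in the DataFrame."""
    expected = {feature for group in vep_feature_groups for feature in group}
    missing = expected.difference(datafile.columns)
    if missing:
        raise KeyError(f'Missing VEP-derived columns: {sorted(missing)}')


# vep_features could look like {'SIFT': ['SIFTcat', 'SIFTval'], 'Consequence': ['is_missense_variant']}
frame = pd.DataFrame(columns=['SIFTcat', 'SIFTval', 'is_missense_variant'])
validate_features_present(frame, [['SIFTcat', 'SIFTval'], ['is_missense_variant']])
```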
