Merge pull request #153 from molgenis/fix/train_feature_json
Fix/train feature json
dennishendriksen authored Dec 21, 2022
2 parents a91aebe + 75dbec3 commit a8893fc
Showing 39 changed files with 971 additions and 617 deletions.
61 changes: 9 additions & 52 deletions resources/train_features.json
@@ -1,54 +1,13 @@
{
- "PolyPhenCat": null,
- "PolyPhenVal": null,
- "cDNApos": null,
- "relcDNApos": null,
- "SIFTcat": null,
- "SIFTval": null,
- "protPos": null,
- "relProtPos": null,
- "oAA": null,
- "nAA": null,
- "CDSpos": null,
- "relCDSpos": null,
- "ref": null,
- "alt": null,
- "is_regulatory_region_variant": null,
- "is_regulatory_region_ablation": null,
- "is_regulatory_region_amplification": null,
- "is_missense_variant": null,
- "is_intron_variant": null,
- "is_upstream_gene_variant": null,
- "is_downstream_gene_variant": null,
- "is_synonymous_variant": null,
- "is_TF_binding_site_variant": null,
- "is_splice_donor_variant": null,
- "is_coding_sequence_variant": null,
- "is_splice_region_variant": null,
- "is_stop_gained": null,
- "is_splice_acceptor_variant": null,
- "is_frameshift_variant": null,
- "is_3_prime_UTR_variant": null,
- "is_inframe_insertion": null,
- "is_inframe_deletion": null,
- "is_5_prime_UTR_variant": null,
- "is_start_lost": null,
- "is_non_coding_transcript_exon_variant": null,
- "is_non_coding_transcript_variant": null,
- "is_TFBS_ablation": null,
- "is_TFBS_amplification": null,
- "is_protein_altering_variant": null,
- "is_stop_lost": null,
- "is_stop_retained_variant": null,
- "is_transcript_ablation": null,
- "is_intergenic_variant": null,
- "is_start_retained_variant": null,
- "is_transcript_amplification": null,
- "is_incomplete_terminal_codon_variant": null,
- "is_mature_miRNA_variant": null,
- "is_NMD_transcript_variant": null,
- "is_feature_elongation": null,
- "is_feature_truncation": null,
+ "PolyPhen": null,
+ "SIFT": null,
+ "cDNA_position": null,
+ "CDS_position": null,
+ "Protein_position": null,
+ "Amino_acids": null,
+ "REF": null,
+ "ALT": null,
+ "Consequence": null,
"SpliceAI_pred_DP_AG": null,
"SpliceAI_pred_DP_AL": null,
"SpliceAI_pred_DP_DG": null,
@@ -57,8 +16,6 @@
"SpliceAI_pred_DS_AL": null,
"SpliceAI_pred_DS_DG": null,
"SpliceAI_pred_DS_DL": null,
- "Type": null,
- "Length": null,
"Grantham": null,
"phyloP": null
}
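The values stay null placeholders; only the key names carry information, and the keys now name raw VEP features instead of post-processed CAPICE features. A minimal sketch of how such a file can be consumed (the loading code below is illustrative, not the CAPICE trainer itself):

```python
import json

# Illustrative only: read the train-feature file and keep just the feature names.
with open('resources/train_features.json') as handle:
    train_features = list(json.load(handle).keys())

print(train_features)  # e.g. ['PolyPhen', 'SIFT', 'cDNA_position', ..., 'phyloP']
```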
1 change: 1 addition & 0 deletions src/molgenis/capice/cli/args_handler_parent.py
@@ -119,6 +119,7 @@ def _retrieve_argument_from_list(self,
return self._single_argument_retriever(arg, arg_name, has_default)
except IOError as e:
self.parser.error(e)
+ return None

@staticmethod
def _single_argument_retriever(arg: list | None,
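The only change here is the explicit `return None` after `self.parser.error(e)`. Because `parser.error` exits the program, the line is unreachable at runtime; presumably it is there to satisfy a "missing return statement" check. A standalone sketch of the same pattern (class, method, and argument names are illustrative, not CAPICE's):

```python
import argparse


class ArgsHandlerSketch:
    """Illustrative only: mirrors the parser.error / return None pattern."""

    def __init__(self) -> None:
        self.parser = argparse.ArgumentParser()

    def retrieve(self, arg: list | None) -> str | None:
        try:
            return self._single_argument_retriever(arg)
        except IOError as e:
            self.parser.error(str(e))
        # Unreachable at runtime (parser.error raises SystemExit), but the
        # explicit return keeps missing-return checks satisfied.
        return None

    @staticmethod
    def _single_argument_retriever(arg: list | None) -> str:
        if not arg:
            raise IOError('no argument supplied')
        return arg[0]


print(ArgsHandlerSketch().retrieve(['input.tsv']))  # input.tsv
```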
3 changes: 2 additions & 1 deletion src/molgenis/capice/cli/args_handler_predict.py
@@ -20,7 +20,8 @@ def _extension(self):

@property
def _model_extension(self) -> tuple[str]:
- return '.json', '.ubj'
+ # Ignore because the amount of values of tuple does not matter.
+ return '.json', '.ubj' # type: ignore

def _model_extension_str(self) -> str:
return self._join_extensions(self._model_extension)
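The `# type: ignore` is needed because the property is annotated as `tuple[str]`, which a type checker reads as a one-element tuple, while two extensions are returned. Widening the annotation would also work; a sketch of that alternative (not what this commit does):

```python
class ModelExtensionSketch:
    """Illustrative only: shows the tuple-length typing nuance."""

    @property
    def _model_extension(self) -> tuple[str, ...]:
        # tuple[str, ...] means "any number of str elements",
        # whereas tuple[str] means "exactly one str element".
        return '.json', '.ubj'


print(ModelExtensionSketch()._model_extension)  # ('.json', '.ubj')
```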
75 changes: 49 additions & 26 deletions src/molgenis/capice/main_capice.py
@@ -1,12 +1,15 @@
import os
from abc import ABC, abstractmethod

import pandas as pd

from molgenis.capice.core.logger import Logger
from molgenis.capice.utilities.enums import Column
from molgenis.capice.core.capice_manager import CapiceManager
from molgenis.capice.utilities.input_parser import InputParser
from molgenis.capice.core.capice_exporter import CapiceExporter
from molgenis.capice.utilities.preprocessor import PreProcessor
from molgenis.capice.utilities.manual_vep_processor import ManualVEPProcessor
from molgenis.capice.utilities.categorical_processor import CategoricalProcessor
from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor
from molgenis.capice.validators.post_file_parse_validator import PostFileParseValidator

@@ -45,7 +48,7 @@ def __init__(self, input_path, output_path, output_given):
def run(self):
pass

def _load_file(self, additional_required_features: list = None):
def _load_file(self, additional_required_features: list | None = None):
"""
Function to load the input TSV file into main
:return: pandas DataFrame
@@ -66,35 +69,55 @@ def _load_file(self, additional_required_features: list = None):
return input_file

@staticmethod
def process(loaded_data):
def process(loaded_data: pd.DataFrame, process_features: list[str]) -> tuple[
pd.DataFrame, dict[str, list[str]]
]:
# Returns might look funky, but Google pydoc does not support multiple return statements.
"""
Function to process the VEP features to CAPICE features.
Function to call the ManualVEPProcessor over loaded_data using the supplied
process_features list.
Args:
loaded_data:
The pandas dataframe over which the VEP features should be processed.
process_features:
List containing either all input features, possibly containing VEP features (in
the case of train) or already all input features that can be VEP processed (in
case of predict).
Returns:
tuple:
Tuple [0] containing: The output dataframe containing all VEP processed features
according to process_features. Depending on the property "drop" will drop the
feature present in process_features from the columns of the output dataframe.
Tuple [1] containing: The output dictionary containing the VEP feature (key)
and the derivative features that originate from said VEP feature (value).
The property "drop" is of no influence here.
"""
processor = ManualVEPProcessor()
processed_data = processor.process(dataset=loaded_data)
return processed_data
processed_data = processor.process(loaded_data, process_features)
processed_features = processor.get_feature_processes()
# No validation, since that is specific to predict.
# Also predict doesn't technically need processed_features, but within predict the first
# argument in the tuple can just be indexed.
# Still returning both is relevant, in case we want to validate the processed_features in
# the future for predict.
return processed_data, processed_features

def preprocess(self, loaded_data, model_features=None):
"""
Function to perform the preprocessing of the loaded data to convert
categorical columns.
:param loaded_data: Pandas dataframe of the imputed CAPICE data
:param model_features: list (default None), a list containing all
the features present within a model file. When set to None,
PreProcessor will activate the train protocol.
Note: please adjust self.exclude_features: to include all of the
features that the preprocessor should NOT process.
Features chr_pos_ref_alt, chr and pos are hardcoded and
thus do not have to be included.
"""
preprocessor = PreProcessor(
exclude_features=self.exclude_features,
model_features=model_features)
capice_data = preprocessor.preprocess(loaded_data)
return capice_data
@staticmethod
def categorical_process(loaded_data: pd.DataFrame,
processing_features: dict[str, list[str]] | None = None,
train_features: list | None = None):
processor = CategoricalProcessor()
capice_data, processed_features = processor.process(
loaded_data,
processable_features=train_features,
predetermined_features=processing_features
)
return capice_data, processed_features

def _export(self, dataset, output):
def _export(self, dataset: pd.DataFrame, output: os.PathLike):
"""
Function to prepare the data to be exported
"""
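To make the new contract of `process` concrete: per its docstring it now returns both the expanded DataFrame and a mapping from each source VEP feature to the columns derived from it. A self-contained toy illustration of that return shape (not the actual ManualVEPProcessor logic):

```python
import pandas as pd


def toy_process(loaded_data: pd.DataFrame,
                process_features: list[str]) -> tuple[pd.DataFrame, dict[str, list[str]]]:
    """Mimics the (DataFrame, feature-mapping) return shape of Main.process."""
    processed = loaded_data.copy()
    feature_map: dict[str, list[str]] = {}
    if 'Consequence' in process_features:
        # One derived flag per consequence term; a single flag is shown here.
        processed['is_missense_variant'] = (
            processed['Consequence'].eq('missense_variant').astype(int)
        )
        feature_map['Consequence'] = ['is_missense_variant']
    return processed, feature_map


data = pd.DataFrame({'Consequence': ['missense_variant', 'intron_variant']})
processed_data, processed_features = toy_process(data, ['Consequence'])
print(processed_features)  # {'Consequence': ['is_missense_variant']}
```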
29 changes: 16 additions & 13 deletions src/molgenis/capice/main_predict.py
@@ -2,13 +2,14 @@
from molgenis.capice.utilities.enums import Column
from molgenis.capice.utilities.predictor import Predictor
from molgenis.capice.utilities.class_suggestor import ClassSuggestor
from molgenis.capice.validators.predict_validator import PredictValidator
from molgenis.capice.validators.post_vep_processing_validator import PostVEPProcessingValidator


class CapicePredict(Main):
"""
Predict class of CAPICE to call the different modules to impute,
preprocess and eventually predict a score over a CAPICE annotated file.
process and eventually predict a score over a CAPICE annotated file.
"""

def __init__(self, input_path, model, output_path, output_given):
@@ -26,27 +27,29 @@ def run(self):
Column.id_source.value,
Column.feature.value,
Column.feature_type.value])
capice_data = self.process(loaded_data=capice_data)
capice_data = self.preprocess(loaded_data=capice_data,
model_features=self.model.get_booster().feature_names)
capice_data = self.process(
loaded_data=capice_data,
process_features=list(self.model.vep_features.keys())
)[0]
PostVEPProcessingValidator().validate_features_present(
capice_data, self.model.vep_features.values()
)
capice_data = self.categorical_process(
loaded_data=capice_data,
processing_features=self.model.processable_features,
train_features=None
)[0]
capice_data = self.predict(loaded_data=capice_data)
capice_data = self.apply_suggested_class(predicted_data=capice_data)
self._export(dataset=capice_data, output=self.output)

def process(self, loaded_data):
"""
Function to process the VEP file to a CAPICE file
"""
processed_data = super().process(loaded_data)
validator = PostVEPProcessingValidator(self.model)
validator.validate_features_present(processed_data)
return processed_data

def predict(self, loaded_data):
"""
Function to call the correct model to predict CAPICE scores
:return: pandas DataFrame
"""
validator = PredictValidator()
validator.validate_data_predict_ready(loaded_data, self.model)
predictor = Predictor(self.model)
capice_data = predictor.predict(loaded_data)
return capice_data
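`PostVEPProcessingValidator` is now instantiated without the model and receives the data plus `self.model.vep_features.values()` directly. Assuming `vep_features` maps each VEP feature to its derived column names, the validation amounts to a column-presence check; a toy sketch of such a check (an assumption, not the validator's actual code):

```python
from collections.abc import Iterable

import pandas as pd


def validate_features_present(datafile: pd.DataFrame,
                              vep_feature_groups: Iterable[list[str]]) -> None:
    """Toy check: every derived VEP column must exist in the DataFrame."""
    expected = {feature for group in vep_feature_groups for feature in group}
    missing = expected.difference(datafile.columns)
    if missing:
        raise KeyError(f'Missing VEP-derived columns: {sorted(missing)}')


# vep_features could look like {'SIFT': ['SIFTcat', 'SIFTval'], 'Consequence': ['is_missense_variant']}
frame = pd.DataFrame(columns=['SIFTcat', 'SIFTval', 'is_missense_variant'])
validate_features_present(frame, [['SIFTcat', 'SIFTval'], ['is_missense_variant']])
```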
