diff --git a/resources/train_features.json b/resources/train_features.json
index 6fa0312a..160a6d1a 100644
--- a/resources/train_features.json
+++ b/resources/train_features.json
@@ -1,54 +1,13 @@
 {
-  "PolyPhenCat": null,
-  "PolyPhenVal": null,
-  "cDNApos": null,
-  "relcDNApos": null,
-  "SIFTcat": null,
-  "SIFTval": null,
-  "protPos": null,
-  "relProtPos": null,
-  "oAA": null,
-  "nAA": null,
-  "CDSpos": null,
-  "relCDSpos": null,
-  "ref": null,
-  "alt": null,
-  "is_regulatory_region_variant": null,
-  "is_regulatory_region_ablation": null,
-  "is_regulatory_region_amplification": null,
-  "is_missense_variant": null,
-  "is_intron_variant": null,
-  "is_upstream_gene_variant": null,
-  "is_downstream_gene_variant": null,
-  "is_synonymous_variant": null,
-  "is_TF_binding_site_variant": null,
-  "is_splice_donor_variant": null,
-  "is_coding_sequence_variant": null,
-  "is_splice_region_variant": null,
-  "is_stop_gained": null,
-  "is_splice_acceptor_variant": null,
-  "is_frameshift_variant": null,
-  "is_3_prime_UTR_variant": null,
-  "is_inframe_insertion": null,
-  "is_inframe_deletion": null,
-  "is_5_prime_UTR_variant": null,
-  "is_start_lost": null,
-  "is_non_coding_transcript_exon_variant": null,
-  "is_non_coding_transcript_variant": null,
-  "is_TFBS_ablation": null,
-  "is_TFBS_amplification": null,
-  "is_protein_altering_variant": null,
-  "is_stop_lost": null,
-  "is_stop_retained_variant": null,
-  "is_transcript_ablation": null,
-  "is_intergenic_variant": null,
-  "is_start_retained_variant": null,
-  "is_transcript_amplification": null,
-  "is_incomplete_terminal_codon_variant": null,
-  "is_mature_miRNA_variant": null,
-  "is_NMD_transcript_variant": null,
-  "is_feature_elongation": null,
-  "is_feature_truncation": null,
+  "PolyPhen": null,
+  "SIFT": null,
+  "cDNA_position": null,
+  "CDS_position": null,
+  "Protein_position": null,
+  "Amino_acids": null,
+  "REF": null,
+  "ALT": null,
+  "Consequence": null,
   "SpliceAI_pred_DP_AG": null,
   "SpliceAI_pred_DP_AL": null,
   "SpliceAI_pred_DP_DG": null,
@@ -57,8 +16,6 @@
   "SpliceAI_pred_DS_AL": null,
   "SpliceAI_pred_DS_DG": null,
   "SpliceAI_pred_DS_DL": null,
-  "Type": null,
-  "Length": null,
   "Grantham": null,
   "phyloP": null
 }
\ No newline at end of file
diff --git a/src/molgenis/capice/cli/args_handler_parent.py b/src/molgenis/capice/cli/args_handler_parent.py
index 0abbf389..a0ba0fc9 100644
--- a/src/molgenis/capice/cli/args_handler_parent.py
+++ b/src/molgenis/capice/cli/args_handler_parent.py
@@ -119,6 +119,7 @@ def _retrieve_argument_from_list(self,
             return self._single_argument_retriever(arg, arg_name, has_default)
         except IOError as e:
             self.parser.error(e)
+        return None

     @staticmethod
     def _single_argument_retriever(arg: list | None,
diff --git a/src/molgenis/capice/cli/args_handler_predict.py b/src/molgenis/capice/cli/args_handler_predict.py
index 8389ec4b..df8c19e8 100644
--- a/src/molgenis/capice/cli/args_handler_predict.py
+++ b/src/molgenis/capice/cli/args_handler_predict.py
@@ -20,7 +20,8 @@ def _extension(self):

     @property
     def _model_extension(self) -> tuple[str]:
-        return '.json', '.ubj'
+        # Ignored, because the number of values in the tuple does not matter.
+        return '.json', '.ubj'  # type: ignore

     def _model_extension_str(self) -> str:
         return self._join_extensions(self._model_extension)
diff --git a/src/molgenis/capice/main_capice.py b/src/molgenis/capice/main_capice.py
index c7ca7e88..d71d0c3b 100644
--- a/src/molgenis/capice/main_capice.py
+++ b/src/molgenis/capice/main_capice.py
@@ -1,12 +1,15 @@
+import os
 from abc import ABC, abstractmethod

+import pandas as pd
+
 from molgenis.capice.core.logger import Logger
 from molgenis.capice.utilities.enums import Column
 from molgenis.capice.core.capice_manager import CapiceManager
 from molgenis.capice.utilities.input_parser import InputParser
 from molgenis.capice.core.capice_exporter import CapiceExporter
-from molgenis.capice.utilities.preprocessor import PreProcessor
 from molgenis.capice.utilities.manual_vep_processor import ManualVEPProcessor
+from molgenis.capice.utilities.categorical_processor import CategoricalProcessor
 from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor
 from molgenis.capice.validators.post_file_parse_validator import PostFileParseValidator

@@ -45,7 +48,7 @@ def __init__(self, input_path, output_path, output_given):
     def run(self):
         pass

-    def _load_file(self, additional_required_features: list = None):
+    def _load_file(self, additional_required_features: list | None = None):
         """
         Function to load the input TSV file into main
         :return: pandas DataFrame
@@ -66,35 +69,55 @@ def _load_file(self, additional_required_features: list | None = None):
         return input_file

     @staticmethod
-    def process(loaded_data):
+    def process(loaded_data: pd.DataFrame,
+                process_features: list[str]) -> tuple[pd.DataFrame, dict[str, list[str]]]:
+        # The Returns section may look odd, but Google-style docstrings do not support
+        # documenting multiple return values separately.
         """
-        Function to process the VEP features to CAPICE features.
+        Function to call the ManualVEPProcessor over loaded_data using the supplied
+        process_features list.
+
+        Args:
+            loaded_data:
+                The pandas dataframe over which the VEP features should be processed.
+
+            process_features:
+                List containing either all input features, possibly including VEP features
+                (in the case of train), or only the input features that can be VEP
+                processed (in the case of predict).
+
+        Returns:
+            tuple:
+                Tuple [0]: the output dataframe containing all VEP processed features
+                according to process_features. Depending on the processor property "drop",
+                a feature present in process_features may be dropped from the columns of
+                the output dataframe.
+                Tuple [1]: the output dictionary containing each VEP feature (key) and the
+                derivative features that originate from that VEP feature (value). The
+                property "drop" has no influence here.
         """
         processor = ManualVEPProcessor()
-        processed_data = processor.process(dataset=loaded_data)
-        return processed_data
+        processed_data = processor.process(loaded_data, process_features)
+        processed_features = processor.get_feature_processes()
+        # No validation here, since that is specific to predict.
+        # Predict does not technically need processed_features either, since within predict
+        # the first element of the tuple can simply be indexed.
+        # Returning both is still relevant, in case we want to validate processed_features
+        # for predict in the future.
+        return processed_data, processed_features

-    def preprocess(self, loaded_data, model_features=None):
-        """
-        Function to perform the preprocessing of the loaded data to convert
-        categorical columns.
-        :param loaded_data: Pandas dataframe of the imputed CAPICE data
-        :param model_features: list (default None), a list containing all
-        the features present within a model file. When set to None,
-        PreProcessor will activate the train protocol.
-
-        Note: please adjust self.exclude_features: to include all of the
-        features that the preprocessor should NOT process.
-        Features chr_pos_ref_alt, chr and pos are hardcoded and
-        thus do not have to be included.
-        """
-        preprocessor = PreProcessor(
-            exclude_features=self.exclude_features,
-            model_features=model_features)
-        capice_data = preprocessor.preprocess(loaded_data)
-        return capice_data
+    @staticmethod
+    def categorical_process(loaded_data: pd.DataFrame,
+                            processing_features: dict[str, list[str]] | None = None,
+                            train_features: list | None = None):
+        """
+        Function to call the CategoricalProcessor over loaded_data, using either the
+        train_features list (train) or the processing_features dictionary (predict).
+        Returns both the processed dataframe and the dictionary of processed features.
+        """
+        processor = CategoricalProcessor()
+        capice_data, processed_features = processor.process(
+            loaded_data,
+            processable_features=train_features,
+            predetermined_features=processing_features
+        )
+        return capice_data, processed_features

-    def _export(self, dataset, output):
+    def _export(self, dataset: pd.DataFrame, output: os.PathLike):
         """
         Function to prepare the data to be exported
         """
diff --git a/src/molgenis/capice/main_predict.py b/src/molgenis/capice/main_predict.py
index 51106adf..1c376c56 100644
--- a/src/molgenis/capice/main_predict.py
+++ b/src/molgenis/capice/main_predict.py
@@ -2,13 +2,14 @@
 from molgenis.capice.utilities.enums import Column
 from molgenis.capice.utilities.predictor import Predictor
 from molgenis.capice.utilities.class_suggestor import ClassSuggestor
+from molgenis.capice.validators.predict_validator import PredictValidator
 from molgenis.capice.validators.post_vep_processing_validator import PostVEPProcessingValidator


 class CapicePredict(Main):
     """
     Predict class of CAPICE to call the different modules to impute,
-    preprocess and eventually predict a score over a CAPICE annotated file.
+    process and eventually predict a score over a CAPICE annotated file.
""" def __init__(self, input_path, model, output_path, output_given): @@ -26,27 +27,29 @@ def run(self): Column.id_source.value, Column.feature.value, Column.feature_type.value]) - capice_data = self.process(loaded_data=capice_data) - capice_data = self.preprocess(loaded_data=capice_data, - model_features=self.model.get_booster().feature_names) + capice_data = self.process( + loaded_data=capice_data, + process_features=list(self.model.vep_features.keys()) + )[0] + PostVEPProcessingValidator().validate_features_present( + capice_data, self.model.vep_features.values() + ) + capice_data = self.categorical_process( + loaded_data=capice_data, + processing_features=self.model.processable_features, + train_features=None + )[0] capice_data = self.predict(loaded_data=capice_data) capice_data = self.apply_suggested_class(predicted_data=capice_data) self._export(dataset=capice_data, output=self.output) - def process(self, loaded_data): - """ - Function to process the VEP file to a CAPICE file - """ - processed_data = super().process(loaded_data) - validator = PostVEPProcessingValidator(self.model) - validator.validate_features_present(processed_data) - return processed_data - def predict(self, loaded_data): """ Function to call the correct model to predict CAPICE scores :return: pandas DataFrame """ + validator = PredictValidator() + validator.validate_data_predict_ready(loaded_data, self.model) predictor = Predictor(self.model) capice_data = predictor.predict(loaded_data) return capice_data diff --git a/src/molgenis/capice/main_train.py b/src/molgenis/capice/main_train.py index 892eff77..5b20653d 100644 --- a/src/molgenis/capice/main_train.py +++ b/src/molgenis/capice/main_train.py @@ -1,5 +1,6 @@ import json +import numpy as np import pandas as pd import xgboost as xgb from scipy import stats @@ -7,6 +8,7 @@ from molgenis.capice.main_capice import Main from molgenis.capice import __version__ +from molgenis.capice.utilities import check_if_in_list from molgenis.capice.utilities.enums import TrainEnums from molgenis.capice.core.capice_exporter import CapiceExporter @@ -22,7 +24,7 @@ def __init__(self, input_path, json_path, test_split, output_path, output_given, # Impute JSON. self.json_path = json_path - self.log.debug('Input impute JSON confirmed: %s', self.json_path) + self.log.debug('Input model features JSON confirmed: %s', self.json_path) # Train test size. self.train_test_size = test_split @@ -45,7 +47,7 @@ def __init__(self, input_path, json_path, test_split, output_path, output_given, self.random_state = 45 self.split_random_state = 4 self.model_random_state = 0 - self.processed_features = [] + self.train_features = [] self.loglevel = self.manager.loglevel self.exporter = CapiceExporter(file_path=self.output, output_given=self.output_given) @@ -55,37 +57,97 @@ def run(self): order to create new CAPICE models. 
""" data = self._load_file(additional_required_features=self.additional_required) - data = self.process(loaded_data=data) with open(self.json_path, 'rt') as impute_values_file: - json_dict = json.load(impute_values_file) - self._validate_impute_complete(data, json_dict) + train_features = list(json.load(impute_values_file).keys()) + + self._validate_train_features_duplicates(train_features) + + self._validate_features_present(data, train_features) + + data, vep_processed = self.process( + loaded_data=data, + process_features=train_features + ) + + processable_features = self._reset_processing_features( + train_features, + vep_processed, + data.columns + ) + + processed_data, processed_features = self.categorical_process( + loaded_data=data, + train_features=processable_features, + processing_features=None + ) + + self._set_train_features(processable_features, processed_features) - processed_data = self.preprocess(loaded_data=data) - self._get_processed_features(dataset=processed_data, impute_keys=json_dict.keys()) processed_train, processed_test = self.split_data(dataset=processed_data, test_size=self.train_test_size) model = self.train(test_set=processed_test, train_set=processed_train) - setattr(model, "model_features", list(json_dict.keys())) + setattr(model, "vep_features", vep_processed) + setattr(model, "processable_features", processed_features) setattr(model, 'CAPICE_version', __version__) self.exporter.export_capice_model(model=model) - def _validate_impute_complete(self, dataset, json_dict): - """ - - :param pd.DataFrame dataset: - :param dict json_dict: - :return: - """ + def _validate_features_present(self, dataset, train_features) -> None: missing = [] - for key in json_dict.keys(): + for key in train_features: if key not in dataset.columns: missing.append(key) if len(missing) > 0: - error_message = 'Impute file missing needed columns for input file: %s' + error_message = 'Train features file missing needed columns for input file: %s' self.log.critical(error_message, missing) raise ValueError(error_message % missing) + def _validate_train_features_duplicates(self, input_train_features: list): + values, counts = np.unique(input_train_features, return_counts=True) + if counts[counts > 1].any(): + error_message = 'Detected duplicate features in user supplied train features: %s' + duplicates = ', '.join(values[counts > 1]) + self.log.critical(error_message, duplicates) + raise KeyError(error_message % duplicates) + + @staticmethod + def _reset_processing_features( + input_train_features: list, + vep_processed: dict, + vep_processed_dataframe_columns: pd.DataFrame.columns + ) -> list[str]: + feature_list = [] + # Adds the VEP input features to which the processor has property drop = False + for feature in vep_processed.keys(): + if feature in vep_processed_dataframe_columns: + feature_list.append(feature) + # Adds back the user input features, but avoiding adding duplicates and + # avoiding the features that had property drop = True + for feature in input_train_features: + if feature not in feature_list and feature not in vep_processed.keys(): + feature_list.append(feature) + # Extending the features with the VEP processors output features + # Has to be new list otherwise features from feature_list go missing + return_list = check_if_in_list(vep_processed.values(), feature_list) + # Merging back with feature_list + return_list.extend(feature_list) + return return_list + + def _set_train_features(self, processable_features: list, processed_features: dict) -> \ + None: + 
+        train_features = []
+        for feature in processable_features:
+            if feature not in processed_features.keys():
+                train_features.append(feature)
+        for feature_name, features in processed_features.items():
+            for feature in features:
+                train_features.append(f'{feature_name}_{feature}')
+        self.log.info(
+            'The following features have been selected for training: %s',
+            ', '.join(train_features)
+        )
+        self.train_features = train_features
+
     def split_data(self, dataset, test_size: float):
         """
         Function to split any given dataset into 2 datasets using the test_size
@@ -99,23 +161,6 @@ def split_data(self, dataset, test_size: float):
                                        random_state=self.split_random_state)
         return train, test

-    def _get_processed_features(self, dataset: pd.DataFrame, impute_keys):
-        """
-        Function to save the columns of a dataset that have been processed and
-        thus are an output column of the CADD annotation.
-        :param dataset: pandas.DataFrame
-        """
-        for column in dataset.columns:
-            for feature in impute_keys:
-                if (column == feature or column.startswith(feature)) and \
-                        column not in self.processed_features:
-                    self.processed_features.append(column)
-        self.log.info(
-            'The following features have been selected for training: %s', ', '.join(
-                self.processed_features
-            )
-        )
-
     def _set_verbosity_from_log_level(self):
         """
         Uses loglevel to set verbosity and xg boost verbosity
@@ -141,7 +186,7 @@ def _create_eval_set(self, xgb_version, test_set):
         :return: a list with tuple with pandas Dataframe, pandas Series and
         possibly "test"
         eval_set
         """
-        eval_data = [test_set[self.processed_features],
+        eval_data = [test_set[self.train_features],
                      test_set[TrainEnums.binarized_label.value]]
         if int(xgb_version.split('.')[0]) < 1:
             eval_data.append('test')
@@ -200,7 +245,7 @@ def train(self, test_set: pd.DataFrame, train_set: pd.DataFrame):
         eval_set = self._create_eval_set(xgb.__version__, test_set)

         self.log.info('Random search starting, please hold.')
-        randomised_search_cv.fit(train_set[self.processed_features],
+        randomised_search_cv.fit(train_set[self.train_features],
                                  train_set[TrainEnums.binarized_label.value],
                                  eval_set=eval_set,
                                  verbose=xgb_verbosity,
diff --git a/src/molgenis/capice/utilities/__init__.py b/src/molgenis/capice/utilities/__init__.py
index 605e46f2..71fd86b7 100644
--- a/src/molgenis/capice/utilities/__init__.py
+++ b/src/molgenis/capice/utilities/__init__.py
@@ -1,6 +1,7 @@
 import functools
 import warnings
 from pathlib import Path
+from collections.abc import Iterable


 def get_project_root_dir():
@@ -23,3 +24,31 @@ def new_func(*args, **kwargs):
             return func(*args, **kwargs)

     return new_func
+
+
+def check_if_in_list(list_of_lists: list[list[object]], to_check_list: Iterable):
+    """
+    Checks, for each value inside the lists contained within list_of_lists, whether that
+    value (which can be an int, str, float, etc.) is present in to_check_list. Values
+    absent from to_check_list are added to the return list; values present in it are not.
+
+    Args:
+        list_of_lists:
+            List containing lists of values (object). Each of these values is checked
+            independently against to_check_list.
+        to_check_list:
+            Iterable against which the individual items of list_of_lists are checked.
+
+    Returns:
+        list:
+            A single list containing all individual items of list_of_lists that do not
+            occur in to_check_list.
+
+    """
+    return_list = []
+    for items in list_of_lists:
+        for item in items:
+            if item not in to_check_list:
+                return_list.append(item)
+    return return_list
diff --git a/src/molgenis/capice/utilities/categorical_processor.py b/src/molgenis/capice/utilities/categorical_processor.py
new file mode 100644
index 00000000..a9d8f85b
--- /dev/null
+++ b/src/molgenis/capice/utilities/categorical_processor.py
@@ -0,0 +1,174 @@
+import numpy as np
+import pandas as pd
+
+from molgenis.capice.core.logger import Logger
+from molgenis.capice.core.capice_manager import CapiceManager
+from molgenis.capice.utilities.column_utils import ColumnUtils
+from molgenis.capice.utilities.enums import Column, UniqueSeparator
+
+
+class CategoricalProcessor:
+    """
+    Class to process the categorical columns of the data into separate one-hot encoded
+    columns before predicting or training.
+    """
+
+    def __init__(self):
+        self.log = Logger().logger
+        self.manager = CapiceManager()
+
+    def process(
+            self,
+            dataset: pd.DataFrame,
+            processable_features: list[str] | None = None,
+            predetermined_features: dict[str, list] | None = None
+    ) -> tuple[pd.DataFrame, dict[str, list]]:
+        """
+        Callable method of CategoricalProcessor to start processing the categorical
+        columns of dataset, according to either processable_features (in case of train)
+        or predetermined_features (in case of predict).
+
+        Args:
+            dataset:
+                The dataset over which the categorical features should be processed.
+            processable_features:
+                List of the features that this module is allowed to process (train).
+                When used in predict: set to None.
+            predetermined_features:
+                Dictionary of the features as they enter the CategoricalProcessor (key)
+                and the output features they should be turned into (values).
+                When used in train: set to None.
+        Returns:
+            processed_dataset:
+                Dataset in which each "O" (object) dtype column has been converted into
+                new columns containing either 0 or 1, depending on whether a sample has
+                that categorical value or not.
+            processing_features:
+                Dictionary containing the input columns (key) and the output columns they
+                were converted into (values).
+        Raises:
+            ValueError:
+                ValueError is raised when both processable_features and
+                predetermined_features are set to None.
+        """
+        self.log.info('Starting processing categorical columns.')
+        self._validate_one_feature_list_present(processable_features, predetermined_features)
+        self._create_preservation_col(dataset)
+        if predetermined_features is None:
+            # Type ignore, since mypy otherwise takes issue with the typing:
+            # processable_features can be None, so it is considered
+            # Optional[list[str]] instead of list[str].
+            processing_features = self._get_categorical_columns(
+                dataset,
+                processable_features  # type: ignore
+            )
+        else:
+            processing_features = predetermined_features
+
+        processed_dataset = self._get_dummies(dataset, processing_features)
+
+        self._ensure_columns_present(processed_dataset, processing_features)
+
+        self.log.info('Successfully processed categorical data.')
+        return processed_dataset, processing_features
+
+    def _validate_one_feature_list_present(
+            self,
+            processable_features: list | None,
+            predetermined_features: dict[str, list] | None
+    ) -> None:
+        if processable_features is None and predetermined_features is None:
+            error_message = 'Neither processable_features nor predetermined_features was supplied!'
+            self.log.critical(error_message)
+            raise ValueError(error_message)
+
+    @staticmethod
+    def _create_preservation_col(dataset: pd.DataFrame) -> None:
+        """
+        Function to create the chr_pos_ref_alt column so that it doesn't get
+        lost in preprocessing. The column is added to dataset in place.
+        :param dataset: unprocessed pandas DataFrame
+        """
+        dataset[Column.chr_pos_ref_alt.value] = dataset[
+            [Column.chr.value, Column.pos.value, Column.ref.value, Column.alt.value]
+        ].astype(str).agg(UniqueSeparator.unique_separator.value.join, axis=1)
+
+    def _get_categorical_columns(self, dataset: pd.DataFrame,
+                                 processable_features: list[str]) -> dict[str, list]:
+        """
+        Method for when predetermined_features is None (usually in case of train) to
+        determine, per categorical feature, the top values that should be used for
+        pandas.get_dummies().
+        Loops through the "O" (object) dtype features in dataset and, if the feature is
+        present in the processable_features list, obtains the most common values of that
+        feature.
+        """
+        processing_features = {}
+        for feature in dataset.select_dtypes(include=["O"]).columns:
+            if feature in processable_features:
+                self.log.debug('Converting the categorical column: %s', feature)
+                processing_features[feature] = self._process_object(dataset[feature])
+        return processing_features
+
+    def _process_object(self, feature_column: pd.Series) -> list:
+        """
+        Method to obtain the top 5 categories of a categorical column.
+        """
+        top_categories = self._get_top_n_cats(feature_column, return_num=5)
+        return top_categories
+
+    def _get_top_n_cats(self, column: pd.Series, return_num: int) -> list:
+        """
+        Function used when a training file is processed, to get the top return_num values
+        within a categorical column. If more values are present, the remainder is grouped
+        under the "other" category.
+        :param column: pandas Series
+        :param return_num: integer
+        :return: list
+        """
+        counts = column.value_counts().index
+        value_counts = list(counts[:return_num])
+        if len(counts) > return_num:
+            value_counts.append(Column.other.value)
+        message = 'For feature: %s saved the following values: %s'
+        self.log.info(message, column.name, ', '.join(value_counts))
+        return value_counts
+
+    def _get_dummies(self, dataset: pd.DataFrame, processing_features: dict) -> pd.DataFrame:
+        """
+        Method to call pandas.get_dummies() to convert each categorical column into
+        multiple columns of 0 and 1.
+        """
+        for feature, feature_values in processing_features.items():
+            self._set_other_values(dataset, feature, feature_values)
+        processed_dataset = pd.get_dummies(dataset, columns=processing_features.keys())
+        return processed_dataset
+
+    def _set_other_values(self, dataset: pd.DataFrame, feature_name: str,
+                          feature_values: list) -> None:
+        """
+        Method to set all entries within a column that are not in the feature_values list
+        to "other".
+ """ + self.log.debug( + 'Converting %d features for feature: %s : %s', + len(feature_values), + feature_name, + ", ".join(feature_values) + ) + dataset[feature_name] = np.where( + dataset[feature_name].isin(feature_values), + dataset[feature_name], + Column.other.value + ) + + def _ensure_columns_present(self, dataset: pd.DataFrame, categorical_out_columns: dict) -> None: + merged_columns = [] + for main_feature, derivative_features in categorical_out_columns.items(): + for feature in derivative_features: + merged_columns.append(f'{main_feature}_{feature}') + column_utils = ColumnUtils() + column_utils.set_specified_columns(merged_columns) + missing = column_utils.get_missing_diff_with(dataset.columns) + for feature in missing: + message = 'Detected column %s not present in columns. Adding full column on NaN' + self.log.debug(message, feature) + dataset[feature] = np.nan diff --git a/src/molgenis/capice/utilities/dynamic_loader.py b/src/molgenis/capice/utilities/dynamic_loader.py index cfa98da6..3f707510 100644 --- a/src/molgenis/capice/utilities/dynamic_loader.py +++ b/src/molgenis/capice/utilities/dynamic_loader.py @@ -21,7 +21,7 @@ def __init__(self, required_attributes: list, path): self.path = path self._check_dir_exists() self.required_attributes = required_attributes - self.modules = {} + self.modules: dict[str, object] = {} def load_manual_annotators(self): """ @@ -82,14 +82,15 @@ def _load_modules_from_path(path): modules.append(module) return modules - def _import(self, usable_modules: list): + def _import(self, usable_modules: list[str]) -> dict[str, object]: """ Function to dynamically load in the modules using the import_module library. :param usable_modules: list of absolute paths to potential modules :return: list of usable modules """ - return_modules = {} + # For some reason, mypy wants this line to be Typed instead of the method. 
+        return_modules: dict[str, object] = {}
         for module in usable_modules:
             name = os.path.basename(module).split('.py')[0]
             spec = util.spec_from_file_location(name=name, location=module)
diff --git a/src/molgenis/capice/utilities/enums.py b/src/molgenis/capice/utilities/enums.py
index 88244e64..1cb84d22 100644
--- a/src/molgenis/capice/utilities/enums.py
+++ b/src/molgenis/capice/utilities/enums.py
@@ -8,8 +8,8 @@ class Column(Enum):
     chr_pos_ref_alt = 'chr_pos_ref_alt'
     chr = 'chr'
     pos = 'pos'
-    ref = 'ref'
-    alt = 'alt'
+    ref = 'REF'
+    alt = 'ALT'
     gene_name = 'gene_name'
     gene_id = 'gene_id'
     id_source = 'id_source'
@@ -17,6 +17,7 @@ class Column(Enum):
     feature_type = 'feature_type'
     score = 'score'
     suggested_class = 'suggested_class'
+    other = 'other_CAPICE_value'


 class OutputClasses(Enum):
diff --git a/src/molgenis/capice/utilities/load_file_postprocessor.py b/src/molgenis/capice/utilities/load_file_postprocessor.py
index 9f3b5366..251026a8 100644
--- a/src/molgenis/capice/utilities/load_file_postprocessor.py
+++ b/src/molgenis/capice/utilities/load_file_postprocessor.py
@@ -33,8 +33,6 @@ def _col_renamer(self):
         self.dataset.rename(
             columns={'CHROM': Column.chr.value,
                      'POS': Column.pos.value,
-                     'REF': Column.ref.value,
-                     'ALT': Column.alt.value,
                      'Gene': Column.gene_id.value,
                      'SYMBOL_SOURCE': Column.id_source.value,
                      'Feature': Column.feature.value,
diff --git a/src/molgenis/capice/utilities/manual_vep_processor.py b/src/molgenis/capice/utilities/manual_vep_processor.py
index 45cc8487..b168cdd7 100644
--- a/src/molgenis/capice/utilities/manual_vep_processor.py
+++ b/src/molgenis/capice/utilities/manual_vep_processor.py
@@ -15,22 +15,34 @@ class ManualVEPProcessor:

     def __init__(self):
         self.log = Logger().logger
+        self.feature_processing_tracker = {}

-    def process(self, dataset: pd.DataFrame):
+    def process(self, dataset: pd.DataFrame, process_features: list[str]) -> pd.DataFrame:
         """
         Callable method for the ManualVEPProcessor to start processing.
         Loads all the VEP processors dynamically from /src/main/python/vep.
-        :param dataset: pandas.DataFrame: loaded pandas dataframe of the user
-        provided input TSV.
-        :return: pandas.DataFrame: dataframe with processed features
+
+        Args:
+            dataset: The input dataset over which the VEP features should be processed.
+            process_features: A collection of all the input features that are used in
+                either training or predicting, over which the VEP processing should happen.
+ + Returns: + pandas.DataFrame: The input dataset, processed on the consequences + """ self.log.info('Starting manual VEP feature processing.') vep_annotators = self._load_vep_processors() dropping_columns = [] n_feats_processed = 0 for processor in vep_annotators: - if processor.name in dataset.columns and processor.usable: + if ( + processor.name in dataset.columns and + processor.name in process_features and + processor.usable + ): self.log.debug('Processing: %s', processor.name) + self._add_feature_tracking(processor.name, processor.columns) dataset = processor.process(dataset) if processor.drop and processor.name not in dropping_columns: dropping_columns.append(processor.name) @@ -43,6 +55,22 @@ def process(self, dataset: pd.DataFrame): self.log.debug('Processed %d features.', n_feats_processed) return dataset + def _add_feature_tracking(self, processor_name: str, processor_features: list[str]): + if processor_name not in self.feature_processing_tracker.keys(): + self.feature_processing_tracker[processor_name] = processor_features + else: + self.feature_processing_tracker[processor_name].extend(processor_features) + + def get_feature_processes(self) -> dict[str, list[str]]: + """ + Getter for the dictionary containing all the processed features and their output features. + + Returns: + dict: + Input VEP processing features (key) and their output features (values) + """ + return self.feature_processing_tracker + def _load_vep_processors(self): location = os.path.join(get_project_root_dir(), 'vep') self.log.debug('Loading modules at %s', location) diff --git a/src/molgenis/capice/utilities/preprocessor.py b/src/molgenis/capice/utilities/preprocessor.py deleted file mode 100644 index 42bd45a9..00000000 --- a/src/molgenis/capice/utilities/preprocessor.py +++ /dev/null @@ -1,194 +0,0 @@ -import numpy as np -import pandas as pd - -from molgenis.capice.core.logger import Logger -from molgenis.capice.core.capice_manager import CapiceManager -from molgenis.capice.utilities.enums import Column, UniqueSeparator -from molgenis.capice.utilities.column_utils import ColumnUtils - - -class PreProcessor: - """ - Class to preprocess the data before predicting or training to separate - categorical columns. - """ - - def __init__(self, exclude_features: list, model_features: list = None): - """ - :param exclude_features: list, - all the features that the preprocessor should not process. - Features that are already excluded include: - chr_pos_ref_alt, chr and pos. - :param model_features: list (default None), a list containing all - the features present within a model file. - """ - self.log = Logger().logger - self.manager = CapiceManager() - self.log.info('Preprocessor started.') - self.train = False - self.exclude_features = [ - Column.chr_pos_ref_alt.value, - Column.chr.value, - Column.pos.value - ] - self.exclude_features += exclude_features - self.model_features = model_features - self.objects = [] - - def _is_train(self): - if self.model_features is None: - self.train = True - - def preprocess(self, dataset: pd.DataFrame): - """ - Callable function for the preprocessor to start preprocessing. 
- :param dataset: unprocessed pandas DataFrame - :return: processed pandas Dataframe - """ - self._is_train() - dataset = self._create_preservation_col(dataset) - self._get_categorical_columns(dataset) - processed_dataset = self._process_objects(dataset) - if not self.train: - processed_dataset = self._ensure_columns_present(processed_dataset) - self.log.info('Successfully preprocessed data.') - return processed_dataset - - @staticmethod - def _create_preservation_col(dataset): - """ - Function to create the chr_pos_ref_alt column so that it doesn't get - lost in preprocessing. - :param dataset: unprocessed pandas DataFrame - :return: unprocessed pandas DataFrame - containing column 'chr_pos_ref_alt' - """ - dataset[Column.chr_pos_ref_alt.value] = dataset[ - [Column.chr.value, Column.pos.value, Column.ref.value, Column.alt.value] - ].astype(str).agg(UniqueSeparator.unique_separator.value.join, axis=1) - return dataset - - def _get_categorical_columns(self, dataset: pd.DataFrame): - """ - Function to get the categorical columns that are within the supplied - annotation features of the imputing file. - :param dataset: pandas DataFrame - """ - for feature in dataset.select_dtypes(include=["O"]).columns: - if feature not in self.exclude_features: - self.objects.append(feature) - self.log.debug('Converting the categorical columns: %s.', ', '.join(self.objects)) - - def _process_objects(self, dataset: pd.DataFrame): - """ - (If train) will create a dictionary telling the processor how many - categories are within a certain column. - If not train: Will look up each annotation feature from the impute file - within the columns of the datafile (either in full name or the column - starts with the feature from the impute file). - This dictionary is then passed to the actual processor. - :param dataset: unprocessed pandas DataFrame - :return: processed pandas DataFrame - """ - annotation_feats_dict = {} - if self.train: - hardcoded_features = [Column.ref.value, Column.alt.value] - for feature in hardcoded_features: - annotation_feats_dict[feature] = 5 - self.log.info('Training protocol, creating new categorical conversion identifiers.') - for feat in self.objects: - if feat not in annotation_feats_dict.keys(): - annotation_feats_dict[feat] = 5 - else: - for feature in self.objects: - annotation_feats_dict = self._process_objects_no_train( - feature=feature, - annotation_features_dict=annotation_feats_dict - ) - processed_data = self._process_categorical_vars( - dataset=dataset, - annotation_feats_dict=annotation_feats_dict - ) - return processed_data - - def _process_objects_no_train(self, feature: str, annotation_features_dict: dict): - for model_feature in self.model_features: - if model_feature.startswith(feature): - extension = model_feature.split(''.join([feature, '_']))[-1] - if feature in annotation_features_dict.keys(): - annotation_features_dict[feature].append(extension) - else: - annotation_features_dict[feature] = [extension] - return annotation_features_dict - - def _process_categorical_vars(self, dataset: pd.DataFrame, annotation_feats_dict: dict): - """ - Processor of categorical columns. Will create new columns based on the - quantity of a value within a column. 
- :param dataset: unprocessed pandas DataFrame - :param annotation_feats_dict: - dictionary that is to contain the levels for each categorical - feature - :return: processed pandas DataFrame - """ - if self.train: - for annotation_feature in annotation_feats_dict.keys(): - feature_names = self._get_top_n_cats( - column=dataset[annotation_feature], - return_num=annotation_feats_dict[annotation_feature] - ) - dataset[annotation_feature] = np.where( - dataset[annotation_feature].isin(feature_names), - dataset[annotation_feature], - 'other' - ) - else: - for annotation_feature in annotation_feats_dict.keys(): - feature_names = annotation_feats_dict[annotation_feature] - self.log.debug('For feature: %s loaded %s levels: %s', - annotation_feature, - len(feature_names), - feature_names - ) - dataset[annotation_feature] = np.where( - dataset[annotation_feature].isin(feature_names), - dataset[annotation_feature], - 'other' - ) - dataset = pd.get_dummies(dataset, columns=list(annotation_feats_dict.keys())) - - return dataset - - def _get_top_n_cats(self, column: pd.Series, return_num: int): - """ - Function for when a training file is preprocessed to get the top - return_num quantity values within a categorical column. - Some converting is done for the logger to be able to print them. - :param column: pandas Series - :param return_num: integer - :return: pandas Series - """ - value_counts = column.value_counts().index[:return_num].values - printable_value_counts = [] - for value in value_counts: - if not isinstance(value, str): - value = str(value) - printable_value_counts.append(value) - message = 'For feature: %s saved the following values: %s' - self.log.info(message, column.name, ', '.join(printable_value_counts)) - return value_counts - - def _ensure_columns_present(self, dataset): - """ - Function to ensure that for the prediction all prediction columns - are present. If a columns is not present, add it with a full - columns of NaN. - """ - column_utils = ColumnUtils() - column_utils.set_specified_columns(self.model_features) - missing = column_utils.get_missing_diff_with(dataset.columns) - for feature in missing: - message = 'Detected column %s not present in columns. Adding full column of NaN' - self.log.debug(message, feature) - dataset[feature] = np.nan - return dataset diff --git a/src/molgenis/capice/validators/input_validator.py b/src/molgenis/capice/validators/input_validator.py index 9c4051bc..0078f195 100644 --- a/src/molgenis/capice/validators/input_validator.py +++ b/src/molgenis/capice/validators/input_validator.py @@ -16,7 +16,7 @@ def validate_input_path(input_path: os.PathLike, extension: tuple[str]): """ if not os.path.exists(input_path): raise FileNotFoundError(f'{input_path} does not exist!') - if not (input_path.endswith(extension)): + if not str(input_path).endswith(extension): raise IOError(f'{input_path} does not match required extension: ' f'{", ".join(extension)}') diff --git a/src/molgenis/capice/validators/model_validator.py b/src/molgenis/capice/validators/model_validator.py index 3a3934f3..bb641641 100644 --- a/src/molgenis/capice/validators/model_validator.py +++ b/src/molgenis/capice/validators/model_validator.py @@ -5,7 +5,8 @@ def validate_has_required_attributes(model): Function to validate if the required attributes CAPICE_version, impute_values and predict_proba are present. 
""" - required_attributes = ['CAPICE_version', 'model_features', 'predict_proba'] + required_attributes = ['CAPICE_version', 'vep_features', + 'processable_features', 'predict_proba'] for attribute in required_attributes: if attribute not in dir(model): raise AttributeError(f'Unable to locate attribute {attribute} in model file!') diff --git a/src/molgenis/capice/validators/post_file_parse_validator.py b/src/molgenis/capice/validators/post_file_parse_validator.py index 78fdab67..27ef95e7 100644 --- a/src/molgenis/capice/validators/post_file_parse_validator.py +++ b/src/molgenis/capice/validators/post_file_parse_validator.py @@ -14,7 +14,7 @@ def validate_n_columns(self, dataset): Validator to make sure that at least 4 columns are loaded (chr, pos, ref, alt). Does NOT check for the names of these columns! """ - if isinstance(dataset, pd.Series) or not dataset.shape[1] >= 4: + if isinstance(dataset, pd.Series) or dataset.shape[1] < 4: error_message = 'Loaded dataset does NOT have enough features! ' \ 'Is there a header present that does not start ' \ 'with ##?' @@ -31,7 +31,7 @@ def validate_variants_present(self, dataset): raise ValueError(error_message) def validate_minimally_required_columns( - self, dataset, additional_required_features: list = None + self, dataset, additional_required_features: list | None = None ): """ Validator for both predict and train to check if the very least columns diff --git a/src/molgenis/capice/validators/post_vep_processing_validator.py b/src/molgenis/capice/validators/post_vep_processing_validator.py index 738fdf8e..712dfec7 100644 --- a/src/molgenis/capice/validators/post_vep_processing_validator.py +++ b/src/molgenis/capice/validators/post_vep_processing_validator.py @@ -1,20 +1,29 @@ +import pandas as pd + from molgenis.capice.core.logger import Logger -from molgenis.capice.utilities.column_utils import ColumnUtils +from molgenis.capice.utilities import check_if_in_list class PostVEPProcessingValidator: - def __init__(self, model): - self.model = model + def __init__(self): self.log = Logger().logger - def validate_features_present(self, datafile): + def validate_features_present(self, datafile: pd.DataFrame, vep_features: list[list[str]]) -> \ + None: """ - Validator to see if all features within the model impute values are - presently processed. + Validator to see if all features that should be present after the + ManualVEPProcessor are present. + Args: + datafile: + Pandas Dataframe over which the feature presence validation should happen. + vep_features: + List of lists of expected output ManualVEPProcesing features as saved in the + model.vep_features.values() + Raises: + KeyError: + Raises KeyError when output VEP feature is not present within datafile. """ - column_utils = ColumnUtils() - column_utils.set_specified_columns(self.model.model_features) - features_not_present = column_utils.get_missing_diff_with(datafile.columns) + features_not_present = check_if_in_list(vep_features, datafile.columns) if len(features_not_present) > 0: error_message = 'Detected required feature(s) %s not ' \ 'present within VEP processed input file!' 
diff --git a/src/molgenis/capice/validators/predict_validator.py b/src/molgenis/capice/validators/predict_validator.py
new file mode 100644
index 00000000..18c8a74e
--- /dev/null
+++ b/src/molgenis/capice/validators/predict_validator.py
@@ -0,0 +1,32 @@
+import pandas as pd
+import xgboost as xgb
+
+from molgenis.capice.core.logger import Logger
+
+
+class PredictValidator:
+    def __init__(self):
+        self.log = Logger().logger
+
+    def validate_data_predict_ready(self, dataset: pd.DataFrame,
+                                    model: xgb.XGBClassifier) -> None:
+        """
+        Validates whether dataset is predict-ready according to the feature names in the
+        model.
+
+        Args:
+            dataset:
+                The dataset that is supposed to be predict-ready.
+            model:
+                The custom CAPICE xgboost.XGBClassifier.
+        Raises:
+            KeyError:
+                Raised when a required predict feature is missing from dataset.
+        """
+        missing = []
+        for feature in model.get_booster().feature_names:  # type: ignore
+            if feature not in dataset.columns:
+                missing.append(feature)
+        if len(missing) > 0:
+            error_message = 'Missing required predict column(s): %s'
+            self.log.critical(error_message, ', '.join(missing))
+            raise KeyError(error_message, ', '.join(missing))
diff --git a/src/molgenis/capice/validators/property_type_validator.py b/src/molgenis/capice/validators/property_type_validator.py
index 1a1271e8..615682c5 100644
--- a/src/molgenis/capice/validators/property_type_validator.py
+++ b/src/molgenis/capice/validators/property_type_validator.py
@@ -1,5 +1,5 @@
 class PropertyTypeValidator:
-    def validate_property(self, value: any, expected_type: any, include_none: bool = False):
+    def validate_property(self, value: object, expected_type: type, include_none: bool = False):
         """
         Logger method to raise a TypeError when a Property is not set correctly.

diff --git a/src/molgenis/capice/validators/version_validator.py b/src/molgenis/capice/validators/version_validator.py
index 958f2053..55e0b1d0 100644
--- a/src/molgenis/capice/validators/version_validator.py
+++ b/src/molgenis/capice/validators/version_validator.py
@@ -79,16 +79,18 @@ def validate_versions_compatible(self, capice_version: str, model_version: str):
             ValueError
                 Raised when the model and framework versions are not compatible.
         """
+        # All mypy ignores here are because mypy cannot tell that match() does not
+        # return None at this point.
         capice = match(self.regex, capice_version)
         model = match(self.regex, model_version)
-        if capice.group('major') != model.group('major'):
+        if capice.group('major') != model.group('major'):  # type: ignore
             raise ValueError(
-                f'CAPICE major version {capice.string} does not match with the model '
-                f'{model.string}!'
+                f'CAPICE major version {capice.string} '  # type: ignore
+                f'does not match with the model '
+                f'{model.string}!'  # type: ignore
             )
-        if capice.group('prerelease') or model.group('prerelease'):
-            self._validate_prerelease(capice, model)
+        if capice.group('prerelease') or model.group('prerelease'):  # type: ignore
+            self._validate_prerelease(capice, model)  # type: ignore

     @staticmethod
     def _validate_prerelease(capice_version: re.Match,
diff --git a/src/molgenis/capice/vep/amino_acids.py b/src/molgenis/capice/vep/amino_acids.py
index a1a3289f..c2c987f6 100644
--- a/src/molgenis/capice/vep/amino_acids.py
+++ b/src/molgenis/capice/vep/amino_acids.py
@@ -23,6 +23,10 @@ def naa(self):
         return self.columns[1]

     def _process(self, dataframe: pd.DataFrame):
-        dataframe[self.columns] = dataframe[self.name].str.split('/', expand=True)
-        dataframe[self.naa].fillna(dataframe[self.oaa], inplace=True)
+        if dataframe[self.name].str.contains('/', regex=False).any():
+            dataframe[self.columns] = dataframe[self.name].str.split('/', expand=True)
+            dataframe[self.naa].fillna(dataframe[self.oaa], inplace=True)
+        else:
+            dataframe[self.oaa] = dataframe[self.name]
+            dataframe[self.naa] = dataframe[self.oaa]
         return dataframe
diff --git a/src/molgenis/capice/vep/consequence.py b/src/molgenis/capice/vep/consequence.py
index c3a780fc..4a0d530e 100644
--- a/src/molgenis/capice/vep/consequence.py
+++ b/src/molgenis/capice/vep/consequence.py
@@ -60,6 +60,10 @@ def columns(self):
             'is_splice_polypyrimidine_tract_variant'
         ]

+    @staticmethod
+    def _fillna():
+        return 0
+
     def _process(self, dataframe: pd.DataFrame):
         splitted_consequence = dataframe[self.name].str.split('&', expand=True)
         raw_consequences = []
@@ -69,6 +73,7 @@ def _process(self, dataframe: pd.DataFrame):
                 np.isin(splitted_consequence, current_consequence).any(axis=1), 1, 0
             )
             raw_consequences.append(current_consequence)
+        self._validate_consequences(splitted_consequence, raw_consequences)
         return dataframe
diff --git a/src/molgenis/capice/vep/template.py b/src/molgenis/capice/vep/template.py
index ca08b0ae..1f3834a9 100644
--- a/src/molgenis/capice/vep/template.py
+++ b/src/molgenis/capice/vep/template.py
@@ -39,9 +39,13 @@ def usable(self, value=False):
     def drop(self):
         return True

+    @staticmethod
+    def _fillna():
+        return np.nan
+
     def process(self, dataframe: pd.DataFrame):
         if dataframe[self.name].isnull().all():
-            dataframe[self.columns] = np.nan
+            dataframe[self.columns] = self._fillna()
             return dataframe
         else:
             return self._process(dataframe)
diff --git a/tests/capice/test_main_train.py b/tests/capice/test_main_train.py
index c63228f0..73ad8284 100644
--- a/tests/capice/test_main_train.py
+++ b/tests/capice/test_main_train.py
@@ -42,6 +42,67 @@ def setUp(self):
         self.main.cross_validate = 2
         self.main.n_iterations = 2

+    def test_validate_train_features_duplicates_fail(self):
+        test_features = ['foo', 'bar', 'baz', 'foo']
+        with self.assertRaises(KeyError) as e:
+            self.main._validate_train_features_duplicates(test_features)
+        # Double quotes, since KeyError adds single quotes around str(e.exception).
+        self.assertEqual(
+            "'Detected duplicate features in user supplied train features: foo'",
+            str(e.exception)
+        )
+
+    def test_validate_train_features_duplicates_pass(self):
+        test_features = ['foo', 'bar', 'baz']
+        self.main._validate_train_features_duplicates(test_features)
+
+    def test_component_reset_train_features(self):
+        user_input = ['REF', 'Amino_acids', 'foo']
+        vep_processed = {
+            'REF': ['Type', 'Length'],
+            'Amino_acids': ['oAA', 'nAA']
+        }
+        dataset = pd.DataFrame(
+            columns=['REF', 'oAA', 'nAA', 'foo']
+        )
+        observed = self.main._reset_processing_features(
user_input, vep_processed, dataset.columns) + # Set because order is not important + self.assertSetEqual(set(observed), {'REF', 'oAA', 'nAA', 'foo', 'Type', 'Length'}) + + def test_integration_reset_train_features(self): + with open(self.main.json_path, 'rt') as fh: + user_input = list(json.load(fh).keys()) + self.main._validate_train_features_duplicates(user_input) + data = self.main._load_file(additional_required_features=self.main.additional_required) + self.main._validate_features_present(data, user_input) + data_processed, vep_processed = self.main.process(data, user_input) + observed = self.main._reset_processing_features( + user_input, vep_processed, data_processed.columns + ) + expected = [ + 'PolyPhenCat', 'PolyPhenVal', 'cDNApos', 'relcDNApos', 'SIFTcat', 'SIFTval', + 'protPos', 'relProtPos', 'oAA', 'nAA', 'CDSpos', 'relCDSpos', 'REF', 'ALT', + 'is_regulatory_region_variant', 'is_regulatory_region_ablation', + 'is_regulatory_region_amplification', 'is_missense_variant', 'is_intron_variant', + 'is_upstream_gene_variant', 'is_downstream_gene_variant', 'is_synonymous_variant', + 'is_TF_binding_site_variant', 'is_splice_donor_variant', 'is_coding_sequence_variant', + 'is_splice_region_variant', 'is_stop_gained', 'is_splice_acceptor_variant', + 'is_splice_donor_5th_base_variant', 'is_splice_donor_region_variant', + 'is_splice_polypyrimidine_tract_variant', 'is_frameshift_variant', + 'is_3_prime_UTR_variant', 'is_inframe_insertion', + 'is_inframe_deletion', 'is_5_prime_UTR_variant', 'is_start_lost', + 'is_non_coding_transcript_exon_variant', 'is_non_coding_transcript_variant', + 'is_TFBS_ablation', 'is_TFBS_amplification', 'is_protein_altering_variant', + 'is_stop_lost', 'is_stop_retained_variant', 'is_transcript_ablation', + 'is_intergenic_variant', 'is_start_retained_variant', 'is_transcript_amplification', + 'is_incomplete_terminal_codon_variant', 'is_mature_miRNA_variant', + 'is_NMD_transcript_variant', 'is_feature_elongation', 'is_feature_truncation', + 'SpliceAI_pred_DP_AG', 'SpliceAI_pred_DP_AL', 'SpliceAI_pred_DP_DG', + 'SpliceAI_pred_DP_DL', 'SpliceAI_pred_DS_AG', 'SpliceAI_pred_DS_AL', + 'SpliceAI_pred_DS_DG', 'SpliceAI_pred_DS_DL', 'Type', 'Length', 'Grantham', 'phyloP'] + self.assertSetEqual(set(observed), set(expected)) + def test_integration_training(self): """ Integration test for the full training part of CAPICE. 
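
For reference, the duplicate detection exercised by the tests above boils down to numpy's np.unique with return_counts=True; a minimal standalone sketch with illustrative feature names:

import numpy as np

features = ['foo', 'bar', 'baz', 'foo']
# np.unique returns the sorted unique values plus how often each occurs.
values, counts = np.unique(features, return_counts=True)
duplicates = values[counts > 1]  # array(['foo'], dtype='<U3')
if duplicates.size > 0:
    raise KeyError('Detected duplicate features in user supplied train features: %s'
                   % ', '.join(duplicates))
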
@@ -114,7 +175,7 @@ def test__set_eval_set_test(self): "test" """ processed_features = ['feat1', 'feat2'] - self.main.processed_features = processed_features + self.main.train_features = processed_features test_set = pd.DataFrame(data={ 'binarized_label': [0, 1, 0], 'feat1': [1, 0, 0], @@ -136,7 +197,7 @@ def test__set_eval_set(self): (length should be 2, as "test" shouldn't be included) """ processed_features = ['feat1', 'feat2'] - self.main.processed_features = processed_features + self.main.train_features = processed_features test_set = pd.DataFrame(data={ 'binarized_label': [0, 1, 0], 'feat1': [1, 0, 0], @@ -149,51 +210,107 @@ def test__set_eval_set(self): pd.testing.assert_series_equal(test_set['binarized_label'], eval_set[0][1]) self.assertEqual(2, len(eval_set[0])) - def test_processed_features(self): - with open( - os.path.join( - _project_root_directory, 'tests', 'resources', 'features_test.json' - ), 'rt' - ) as fh: - features = json.load(fh) - dataset = pd.DataFrame( + def test_full_processed_features(self): + loaded_dataset = pd.DataFrame( { - 'unused_feature_1': [1, 2, 3], - 'feature_1': ['foo', 'bar', 'baz'], - 'unused_feature_2': [3, 4, 5], - 'feature_foobarbaz': ['bar', 'baz', 'foo'], - 'feature_3_cat1': [10, 20, 30], - 'feature_3_cat2': [10, 20, 30], - 'feature_3_cat3': [10, 20, 30] + 'REF': ['C', 'GC'], + 'ALT': ['A', 'G'], + 'PolyPhen': [0.1, 0.01], + 'SIFT': [0.1, 0.01], + 'Other_feature': ['foo', 'bar'] } ) - self.main._get_processed_features(dataset, features.keys()) + features = ['REF', 'ALT', 'PolyPhen', 'SIFT'] + processed_data, vep_processed = self.main.process(loaded_dataset, features) + resetted_features = self.main._reset_processing_features( + features, vep_processed, processed_data.columns) self.assertSetEqual( - {'feature_1', - 'feature_foobarbaz', - 'feature_3_cat1', - 'feature_3_cat2', - 'feature_3_cat3'}, - set(self.main.processed_features) + {'REF', 'ALT', 'Length', 'Type', 'PolyPhenVal', 'PolyPhenCat', 'SIFTval', 'SIFTcat'}, + set(resetted_features) ) - def test_full_processed_features(self): - loaded_dataset = pd.DataFrame( + def test_component_feature_selection(self): + test_case = pd.DataFrame( { - 'ref': ['C', 'GC'], - 'alt': ['A', 'G'], - 'PolyPhen': [0.1, 0.01], - 'Sift': [0.1, 0.01], - 'Other_feature': ['foo', 'bar'] + 'chr': [1, 2, 3], + 'pos': [1, 2, 3], + 'REF': ['A', 'AT', 'ATCG'], + 'ALT': ['C', 'G', 'ATGCAB'], + 'REFSEQ_MATCH': ['foo', 'bar', 'baz'], # Included because of REF, can cause issues + 'ALTERNATIVE_FEATURE': ['foo', 'bar', 'baz'], + 'feature_1': ['foo_bar', 'bar', 'baz'] } ) - processed_data = self.main.process(loaded_dataset) - with open(self.main.json_path, 'rt') as fh: - features = json.load(fh).keys() - self.main._get_processed_features(processed_data, features) + user_input = ['REF', 'ALT', 'Type', 'Length', 'feature_1'] + processed_data, vep_processed = self.main.process(test_case, user_input) + self.assertIn( + 'REF', + vep_processed + ) self.assertSetEqual( - {'ref', 'alt', 'Length', 'Type', 'PolyPhenVal', 'PolyPhenCat'}, - set(self.main.processed_features) + set(vep_processed['REF']), + {'Type', 'Length'} + ) + processable_features = self.main._reset_processing_features( + user_input, vep_processed, processed_data.columns + ) + self.assertSetEqual( + set(processable_features), + {'REF', 'ALT', 'Type', 'Length', 'feature_1'} + ) + fully_processed_data, processed_features = self.main.categorical_process( + processed_data, train_features=processable_features, processing_features=None + ) + # Test to see if REF is 
successfully processed
+        self.assertIn(
+            'REF_A',
+            fully_processed_data.columns
+        )
+        # Test to see if REFSEQ is successfully skipped
+        self.assertNotIn(
+            'REFSEQ_MATCH_foo',
+            fully_processed_data.columns
+        )
+        # Another test to see if feature_1 is successfully processed
+        self.assertIn(
+            'feature_1_foo_bar',
+            fully_processed_data.columns
+        )
+        # Test to see if category A is successfully saved for REF
+        self.assertIn(
+            'A',
+            processed_features['REF']
+        )
+
+        self.main._set_train_features(
+            processable_features, processed_features
+        )
+
+        # Test to see if REF_A is successfully inserted into the final training features
+        self.assertIn(
+            'REF_A',
+            self.main.train_features
+        )
+
+        # Test to see if REFSEQ_MATCH_foo is successfully skipped
+        self.assertNotIn(
+            'REFSEQ_MATCH_foo',
+            self.main.train_features
+        )
+        # Test to see if multiple underscores also get successfully inserted
+        self.assertIn(
+            'feature_1_foo_bar',
+            self.main.train_features
+        )
+        self.assertSetEqual(
+            set(self.main.train_features),
+            {
+                'REF_A', 'REF_AT', 'REF_ATCG',
+                'ALT_C', 'ALT_G', 'ALT_ATGCAB',
+                'Type_DELINS', 'Type_SNV',
+                'Length',
+                'feature_1_foo_bar', 'feature_1_bar', 'feature_1_baz'
+            }
+        )
diff --git a/tests/capice/utilities/test_categorical_processor.py b/tests/capice/utilities/test_categorical_processor.py
new file mode 100644
index 00000000..baa89410
--- /dev/null
+++ b/tests/capice/utilities/test_categorical_processor.py
@@ -0,0 +1,231 @@
+import unittest
+
+import numpy as np
+import pandas as pd
+
+from molgenis.capice.utilities.enums import Column
+from tests.capice.test_templates import set_up_impute_preprocess, teardown
+from molgenis.capice.utilities.categorical_processor import CategoricalProcessor
+
+
+class TestCategoricalProcessor(unittest.TestCase):
+    @classmethod
+    def setUp(cls):
+        cls.preprocessor = CategoricalProcessor()
+        cls.chr_pos_ref_alt_testcase = pd.DataFrame(
+            {
+                'chr': [1, 2, 3, 4, 5],
+                'pos': [1, 2, 3, 4, 5],
+                'REF': [1, 2, 3, 4, 5],
+                'ALT': [1, 2, 3, 4, 5]
+            }
+        )
+        cls.main, cls.model = set_up_impute_preprocess()
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        teardown()
+
+    @staticmethod
+    def creat_other_column(value: str) -> str:
+        return '_'.join([value, Column.other.value])
+
+    def test_unit_preprocessing_file(self):
+        """
+        Unit test for the CategoricalProcessor to check if it works with just the file
+        header information.
+        """
+        print('Preprocessing (unit) (file)')
+        self.main.categorical_process(
+            loaded_data=self.main.process(
+                self.main._load_file(), process_features=self.model.vep_features.keys()
+            )[0], processing_features=self.model.processable_features
+        )
+
+    def test_component_preprocessing(self):
+        """
+        Component test for preprocessing. All columns within the model features should be
+        processed. Furthermore, within all processed columns, there should not be one or
+        more columns that are still considered categorical.
+ """ + print('Preprocessing (component)') + processed_file = self.main.categorical_process( + loaded_data=self.main.process( + self.main._load_file(), process_features=self.model.vep_features.keys() + )[0], processing_features=self.model.processable_features + )[0] + model_features = self.model.get_booster().feature_names + processed_columns = processed_file.columns + for feature in model_features: + # Check if all model features are present before predicting + self.assertIn(feature, processed_columns) + # Check if none of the processed columns can be marked as categorical + self.assertEqual( + len(processed_file[model_features].select_dtypes(include=["O"]).columns), + 0 + ) + + def test_preprocessing_train(self): + """ + Component test for the preprocessing part with train=True. + """ + data = pd.DataFrame( + { + 'foo': ['a', 'b', 'c', np.nan, np.nan, np.nan], + 'bar': ['a', np.nan, np.nan, np.nan, np.nan, np.nan], + 'baz': ['a', 'b', 'c', 'd', 'e', 'f'], + 'feature_1': [1, 2, 3, 4, np.nan, np.nan], + 'feature_excluded': [1, 2, 3, 4, np.nan, np.nan], + 'chr': [1, 2, 3, 4, 5, 6], + 'pos': [100, 200, 300, 400, 500, 600], + 'REF': ['A', 'T', 'A', 'T', 'A', 'T'], + 'ALT': ['G', 'C', 'G', 'C', 'G', 'C'] + } + ) + user_input_features = ['foo', 'bar', 'baz', 'feature_1'] + processor = CategoricalProcessor() + observed = processor.process(data, processable_features=user_input_features)[0] + expected = pd.DataFrame( + { + 'foo_a': [1, 0, 0, 0, 0, 0], + 'foo_b': [0, 1, 0, 0, 0, 0], + 'foo_c': [0, 0, 1, 0, 0, 0], + self.creat_other_column('foo'): [0, 0, 0, 1, 1, 1], + 'bar_a': [1, 0, 0, 0, 0, 0], + self.creat_other_column('bar'): [0, 1, 1, 1, 1, 1], + 'baz_a': [1, 0, 0, 0, 0, 0], + 'baz_b': [0, 1, 0, 0, 0, 0], + 'baz_c': [0, 0, 1, 0, 0, 0], + 'baz_d': [0, 0, 0, 1, 0, 0], + 'baz_e': [0, 0, 0, 0, 1, 0], + self.creat_other_column('baz'): [0, 0, 0, 0, 0, 1], + 'REF': ['A', 'T', 'A', 'T', 'A', 'T'], + 'ALT': ['G', 'C', 'G', 'C', 'G', 'C'], + 'feature_1': [1, 2, 3, 4, np.nan, np.nan], + 'feature_excluded': [1, 2, 3, 4, np.nan, np.nan], + 'chr': [1, 2, 3, 4, 5, 6], + 'pos': [100, 200, 300, 400, 500, 600], + 'chr_pos_ref_alt': [ + '1_VeryUniqueCAPICESeparator_100_VeryUniqueCAPICESeparator_' + 'A_VeryUniqueCAPICESeparator_G', + '2_VeryUniqueCAPICESeparator_200_VeryUniqueCAPICESeparator_' + 'T_VeryUniqueCAPICESeparator_C', + '3_VeryUniqueCAPICESeparator_300_VeryUniqueCAPICESeparator_' + 'A_VeryUniqueCAPICESeparator_G', + '4_VeryUniqueCAPICESeparator_400_VeryUniqueCAPICESeparator_' + 'T_VeryUniqueCAPICESeparator_C', + '5_VeryUniqueCAPICESeparator_500_VeryUniqueCAPICESeparator_' + 'A_VeryUniqueCAPICESeparator_G', + '6_VeryUniqueCAPICESeparator_600_VeryUniqueCAPICESeparator_' + 'T_VeryUniqueCAPICESeparator_C', + ] + } + ) + pd.testing.assert_frame_equal( + observed.sort_index(axis=1), expected.sort_index(axis=1), check_dtype=False + ) + + def test_creation_other(self): + test_case = pd.concat( + [ + self.chr_pos_ref_alt_testcase, + pd.DataFrame( + { + 'foo': ['bar', 'baz', 'barz', 'foobar', 'foobaz', 'last'] + } + ) + ], axis=1 + ) + observed_df, observed_dict = self.preprocessor.process(test_case, processable_features=[ + 'foo']) + self.assertIn( + 'foo', + observed_dict.keys() + ) + self.assertIn( + Column.other.value, + observed_dict['foo'] + ) + self.assertIn(self.creat_other_column('foo'), observed_df.columns) + + def test_creation_other_notin(self): + test_case = pd.concat( + [ + self.chr_pos_ref_alt_testcase, + pd.DataFrame( + { + 'foo': ['bar', 'baz', 'barz', 'foobar', 'foobaz'] + } + ) + ], axis=1 + ) 
+        observed_df, observed_dict = self.preprocessor.process(
+            test_case, processable_features=['foo']
+        )
+        self.assertIn('foo', observed_dict.keys())
+        self.assertNotIn(Column.other.value, observed_dict['foo'])
+        self.assertNotIn(self.creat_other_column('foo'), observed_df.columns)
+
+    def test_other_in_top_5(self):
+        # Tests that, if "other" occurs in the top 5 categories, only this "other" feature
+        # gets sampled and no other samples get sampled into the "other" category.
+        test_case = pd.concat(
+            [
+                self.chr_pos_ref_alt_testcase,
+                pd.DataFrame(
+                    {
+                        'foo': ['other', 'other', 'foo', 'bar', 'baz', 'foobar', 'foobaz']
+                    }
+                )
+            ], axis=1
+        )
+        observed_df, observed_dict = self.preprocessor.process(
+            test_case, processable_features=['foo']
+        )
+        test_series = observed_df['foo_other']
+        self.assertFalse(test_series[test_series > 0].size > 2,
+                         msg=f'Actual size: {test_series[test_series > 0].size}')
+        self.assertIn(self.creat_other_column('foo'), observed_df.columns)
+
+    def test__create_preservation_col(self):
+        input_data_frame = pd.DataFrame(
+            {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'REF': ['A', 'T', 'C'],
+             'ALT': ['G', 'A', 'T']})
+        expected_output = pd.DataFrame(
+            {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'REF': ['A', 'T', 'C'],
+             'ALT': ['G', 'A', 'T'],
+             'chr_pos_ref_alt':
+                 ['1_VeryUniqueCAPICESeparator_123_VeryUniqueCAPICESeparator_'
+                  'A_VeryUniqueCAPICESeparator_G',
+                  '2_VeryUniqueCAPICESeparator_456_VeryUniqueCAPICESeparator_'
+                  'T_VeryUniqueCAPICESeparator_A',
+                  '4_VeryUniqueCAPICESeparator_789_VeryUniqueCAPICESeparator_'
+                  'C_VeryUniqueCAPICESeparator_T']
+             }
+        )
+        # _create_preservation_col alters the input dataframe in place.
+        self.preprocessor._create_preservation_col(input_data_frame)
+
+        pd.testing.assert_frame_equal(expected_output, input_data_frame)
+
+    def test__get_categorical_columns(self):
+        preprocessor = CategoricalProcessor()
+        input_data_frame = pd.DataFrame(
+            {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'REF': ['A', 'T', 'C'],
+             'ALT': ['G', 'A', 'T']})
+        features = preprocessor._get_categorical_columns(
+            input_data_frame, processable_features=['REF', 'ALT']
+        )
+        self.assertIn('REF', features.keys())
+        self.assertIn('ALT', features.keys())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/capice/utilities/test_file_postprocessor.py b/tests/capice/utilities/test_file_postprocessor.py
index 257f9d73..0c8698ed 100644
--- a/tests/capice/utilities/test_file_postprocessor.py
+++ b/tests/capice/utilities/test_file_postprocessor.py
@@ -45,8 +45,8 @@ def test_load_file_pre_processor(self):
             {
                 "chr": [1, 2, 3],
                 "pos": [100, 200, 300],
-                "ref": ['A', 'T', 'G'],
-                "alt": ['T', 'G', 'A'],
+                "REF": ['A', 'T', 'G'],
+                "ALT": ['T', 'G', 'A'],
                 "id_source": ['foo', 'foo', 'bar'],
                 "feature": ['bar', 'bar', 'buz'],
                 "gene_name": ['g1', 'g2', 'g3'],
diff --git a/tests/capice/utilities/test_load_file_postprocessor.py b/tests/capice/utilities/test_load_file_postprocessor.py
index c0af6215..76558199 100644
--- a/tests/capice/utilities/test_load_file_postprocessor.py
+++ b/tests/capice/utilities/test_load_file_postprocessor.py
@@ -32,8 +32,8 @@ def test_process(self):
             {
                 'chr': [1],
                 'pos': [123],
-                'ref': ['A'],
-                'alt': ['G'],
+                'REF': ['A'],
+                'ALT': ['G'],
                 'gene_id': [123],
                 'id_source': ['hgnc'],
                 'feature': ['NM1.123'],
diff --git a/tests/capice/utilities/test_manual_vep_processor.py b/tests/capice/utilities/test_manual_vep_processor.py
index 31756a49..461bc421 100644
--- a/tests/capice/utilities/test_manual_vep_processor.py
+++ b/tests/capice/utilities/test_manual_vep_processor.py
@@ -17,8 +17,8 @@ def 
setUpClass(cls) -> None: { 'chr': {0: '1', 1: '1'}, 'pos': {0: 1, 1: 10042538}, - 'ref': {0: 'C', 1: 'C'}, - 'alt': {0: 'T', 1: 'T'}, + 'REF': {0: 'C', 1: 'C'}, + 'ALT': {0: 'T', 1: 'T'}, 'Consequence': {0: 'missense_variant', 1: 'downstream_gene_variant'}, 'gene_name': {0: 'NMNAT1', 1: 'NMNAT1'}, 'SourceID': {0: 'HGNC', 1: 'HGNC'}, @@ -36,6 +36,8 @@ def setUpClass(cls) -> None: } ) cls.annotator = ManualVEPProcessor() + cls.user_input_features = ['REF', 'PolyPhen', 'SIFT', 'Consequence', 'cDNA_position', + 'CDS_position', 'Protein_position', 'Amino_acids'] def setUp(self) -> None: print('Testing case:') @@ -99,8 +101,8 @@ def test_component_annotator(self): [ 'chr', 'pos', - 'ref', - 'alt', + 'REF', + 'ALT', 'gene_name', 'SourceID', 'HGNC_ID', @@ -113,7 +115,7 @@ def test_component_annotator(self): expected_processed_columns ], axis=1 ) - outcome = self.annotator.process(self.dataset) + outcome = self.annotator.process(self.dataset, self.user_input_features) # if numpy.array dtype not given, # then the type will be determined as the minimum type required to hold the # objects in the sequence. this minimal type is system dependent. @@ -128,8 +130,8 @@ def test_bug_attributeerror_template_sift_polyphen(self): { 'chr': ['1', '2'], 'pos': [100, 200], - 'ref': ['A', 'GCC'], - 'alt': ['C', 'C'], + 'REF': ['A', 'GCC'], + 'ALT': ['C', 'C'], 'SIFT': [np.nan, np.nan], 'PolyPhen': [np.nan, np.nan] } @@ -139,8 +141,8 @@ def test_bug_attributeerror_template_sift_polyphen(self): { 'chr': ['1', '2'], 'pos': [100, 200], - 'ref': ['A', 'GCC'], - 'alt': ['C', 'C'], + 'REF': ['A', 'GCC'], + 'ALT': ['C', 'C'], 'SIFTcat': [np.nan, np.nan], 'SIFTval': [np.nan, np.nan], 'PolyPhenCat': [np.nan, np.nan], @@ -148,10 +150,44 @@ def test_bug_attributeerror_template_sift_polyphen(self): } ) annotator = ManualVEPProcessor() - out_dataframe = annotator.process(bugged_dataframe) + out_dataframe = annotator.process(bugged_dataframe, self.user_input_features) # Testing for expected dataframe columns, since it processes more. 
        pd.testing.assert_frame_equal(expected_dataframe, out_dataframe[expected_dataframe.columns])
 
+    @staticmethod
+    def prepare_getter_tests():
+        data = pd.DataFrame(
+            {
+                'REF': ['A', 'C'],
+                'ALT': ['T', 'G'],
+                'PolyPhen': [0.08, 0.98]
+            }
+        )
+        user_input = ['REF', 'PolyPhen']
+        annotator = ManualVEPProcessor()
+        annotator.process(data, user_input)
+        return annotator
+
+    def test_getter_vep_input(self):
+        # Reuse prepare_getter_tests() so the annotator set-up is not duplicated here.
+        annotator = self.prepare_getter_tests()
+        observed = annotator.get_feature_processes()
+        expected_keys = ['REF', 'PolyPhen']
+        expected_values = ['Type', 'Length', 'PolyPhenCat', 'PolyPhenVal']
+        for input_feature in observed.keys():
+            self.assertIn(input_feature, expected_keys)
+        for output_features in observed.values():
+            for feature in output_features:
+                self.assertIn(feature, expected_values)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/capice/utilities/test_predict.py b/tests/capice/utilities/test_predict.py
index a9262dce..304b9efb 100644
--- a/tests/capice/utilities/test_predict.py
+++ b/tests/capice/utilities/test_predict.py
@@ -24,11 +24,11 @@ def test_unit_prediction(self):
         """
         print('Prediction (unit)')
         self.main.predict(
-            self.main.preprocess(
+            self.main.categorical_process(
                 self.main.process(
-                    self.main._load_file()
-                ), model_features=self.model.get_booster().feature_names
-            )
+                    self.main._load_file(), process_features=self.model.vep_features.keys()
+                )[0], processing_features=self.model.processable_features
+            )[0]
         )
 
     def test_component_prediction(self):
@@ -38,11 +38,11 @@ def test_component_prediction(self):
         """
         print('Prediction (component)')
         prediction = self.main.predict(
-            self.main.preprocess(
+            self.main.categorical_process(
                 self.main.process(
-                    self.main._load_file()
-                ), model_features=self.model.get_booster().feature_names
-            )
+                    self.main._load_file(), process_features=self.model.vep_features.keys()
+                )[0], processing_features=self.model.processable_features
+            )[0]
         )
         # Combined sum of the prediction score should be higher than 0
         self.assertGreater(prediction[Column.score.value].sum(), 0)
diff --git a/tests/capice/utilities/test_predictor.py b/tests/capice/utilities/test_predictor.py
index 8b8d77e7..3958c637 100644
--- a/tests/capice/utilities/test_predictor.py
+++ b/tests/capice/utilities/test_predictor.py
@@ -10,11 +10,11 @@ def setUpClass(cls):
         print('Setting up.')
         main, model = set_up_impute_preprocess()
         cls.predictor = Predictor(model)
-        cls.dataset = main.preprocess(
+        cls.dataset = main.categorical_process(
             main.process(
-                main._load_file()
-            ), model_features=model.get_booster().feature_names
-        )
+                main._load_file(), process_features=model.vep_features.keys()
+            )[0], processing_features=model.processable_features
+        )[0]
 
     def test_predict(self):
         observed = self.predictor.predict(self.dataset)
diff --git a/tests/capice/utilities/test_preprocessing.py b/tests/capice/utilities/test_preprocessing.py
deleted file mode 100644
index 2c1f38c6..00000000
--- a/tests/capice/utilities/test_preprocessing.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import unittest
-
-from tests.capice.test_templates import set_up_impute_preprocess, teardown
-
-
-class TestPreprocessing(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        print('Setting up.')
-        cls.main, cls.model = set_up_impute_preprocess()
-
-    @classmethod
-    def tearDownClass(cls):
-        print('Tearing down.')
-        teardown()
-
-    def 
setUp(self): - print('Testing case:') - - def test_unit_preprocessing_file(self): - """ - Unit test for the preprocessor to see if the preprocessor works just - the file header information. - """ - print('Preprocessing (unit) (file)') - self.main.preprocess( - loaded_data=self.main.process( - self.main._load_file() - ), model_features=self.model.get_booster().feature_names - ) - - def test_component_preprocessing(self): - """ - component test for preprocessing. All columns within the CADD - features should be processed. Furthermore, - within all processed columns, - there should not be 1 or more column that is still - considered categorical. - """ - print('Preprocessing (component)') - processed_file = self.main.preprocess( - self.main.process( - self.main._load_file() - ), model_features=self.model.get_booster().feature_names - ) - model_features = self.model.get_booster().feature_names - processed_columns = processed_file.columns - for feature in model_features: - # Check if all model features are present before predicting - self.assertIn(feature, processed_columns) - # Check if none of the processed columns can be marked as categorical - self.assertEqual( - len(processed_file[model_features].select_dtypes(include=["O"]).columns), - 0 - ) - - def test_component_preprocessing_train(self): - """ - Component test for the preprocessing part with train=True. - """ - print('Preprocessing (train) (component)') - preprocessed_file = self.main.preprocess( - self.main.process( - self.main._load_file() - ) - ) - - # Test if all columns matching, - # or starting with features within the imputing - # file are not classified objects. - impute_features = self.model.model_features - processed_columns = preprocessed_file.columns - present_features = 1 - # Should be one, since the for loop quits before - # it can finish the last add_one - test_features = [] - add_one = False - for feature in impute_features: - if add_one: - present_features += 1 - add_one = False - for processed_feature in processed_columns: - if processed_feature.startswith(feature): - add_one = True - test_features.append(processed_feature) - # Test if all impute features are present - self.assertEqual(len(impute_features), present_features) - # Test if no columns are still objects. 
- self.assertEqual( - len(preprocessed_file[test_features].select_dtypes(include=["O"]).columns), - 0 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/capice/utilities/test_preprocessor.py b/tests/capice/utilities/test_preprocessor.py deleted file mode 100644 index bdbfa92d..00000000 --- a/tests/capice/utilities/test_preprocessor.py +++ /dev/null @@ -1,85 +0,0 @@ -import unittest - -import pandas as pd - -from molgenis.capice.utilities.preprocessor import PreProcessor - - -def get_uint8_array(values_list): - return pd.array(values_list, dtype='uint8') - - -class TestPreprocessor(unittest.TestCase): - @classmethod - def setUp(cls): - print('Setting up.') - cls.preprocessor = PreProcessor([]) - - def test__create_preservation_col(self): - input_data_frame = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T']}) - expected_output = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T'], - 'chr_pos_ref_alt': - ['1_VeryUniqueCAPICESeparator_123_VeryUniqueCAPICESeparator_' - 'A_VeryUniqueCAPICESeparator_G', - '2_VeryUniqueCAPICESeparator_456_VeryUniqueCAPICESeparator_' - 'T_VeryUniqueCAPICESeparator_A', - '4_VeryUniqueCAPICESeparator_789_VeryUniqueCAPICESeparator_' - 'C_VeryUniqueCAPICESeparator_T'] - } - ) - actual_output = self.preprocessor._create_preservation_col(input_data_frame) - - pd.testing.assert_frame_equal(expected_output, actual_output) - - def test__is_train(self): - self.assertEqual(False, self.preprocessor.train) - self.preprocessor._is_train() - self.assertEqual(True, self.preprocessor.train) - - def test__get_categorical_columns(self): - input_data_frame = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T']}) - self.preprocessor._get_categorical_columns(input_data_frame) - self.assertEqual(['ref', 'alt'], self.preprocessor.objects) - - def test__process_objects_train_false(self): - self.preprocessor.objects = ['ref', 'alt', 'blaat'] - self.preprocessor.model_features = ['blaat_something'] - input_data_frame = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T'], 'blaat': ['some', 'value', 'something']}) - - expected = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T'], - 'blaat_other': get_uint8_array([1, 1, 0]), - 'blaat_something': get_uint8_array([0, 0, 1]) - }) - observed = self.preprocessor._process_objects(input_data_frame) - pd.testing.assert_frame_equal(expected, observed) - - def test__process_objects_train_true(self): - self.preprocessor.train = True - self.preprocessor.objects = ['ref', 'alt', 'blaat'] - self.preprocessor.model_features = ['blaat_something'] - input_data_frame = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T'], 'blaat': ['some', 'value', 'here']}) - expected = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref_A': get_uint8_array([1, 0, 0]), - 'ref_C': get_uint8_array([0, 0, 1]), 'ref_T': get_uint8_array([0, 1, 0]), - 'alt_A': get_uint8_array([0, 1, 0]), 'alt_G': get_uint8_array([1, 0, 0]), - 'alt_T': get_uint8_array([0, 0, 1]), 'blaat_here': get_uint8_array([0, 0, 1]), - 'blaat_some': get_uint8_array([1, 0, 0]), - 'blaat_value': get_uint8_array([0, 1, 0])}) - observed = self.preprocessor._process_objects(input_data_frame) - pd.testing.assert_frame_equal(expected, observed) - - -if 
__name__ == '__main__': - unittest.main() diff --git a/tests/capice/validators/test_post_file_parse_validator.py b/tests/capice/validators/test_post_file_parse_validator.py index adecb669..ae3e57b7 100644 --- a/tests/capice/validators/test_post_file_parse_validator.py +++ b/tests/capice/validators/test_post_file_parse_validator.py @@ -14,8 +14,8 @@ def setUpClass(cls) -> None: { 'chr': [1, 2], 'pos': [100, 200], - 'ref': ['A', 'A'], - 'alt': ['T', 'T'], + 'REF': ['A', 'A'], + 'ALT': ['T', 'T'], 'feat1': ['foo', 'bar'] } ) @@ -62,7 +62,7 @@ def test_validation_incorrect_required_columns_preset_required(self): self.assertRaises( KeyError, self.validator.validate_minimally_required_columns, - self.dataset.drop(columns='ref'), + self.dataset.drop(columns='REF'), additional_required_features='feat1' ) diff --git a/tests/capice/validators/test_post_vep_processing_validator.py b/tests/capice/validators/test_post_vep_processing_validator.py index 06b05aaf..44d8fb8b 100644 --- a/tests/capice/validators/test_post_vep_processing_validator.py +++ b/tests/capice/validators/test_post_vep_processing_validator.py @@ -14,14 +14,13 @@ def setUpClass(cls) -> None: { 'chr': [1, 2], 'pos': [100, 200], - 'ref': ['A', 'A'], - 'alt': ['T', 'T'], + 'REF': ['A', 'A'], + 'ALT': ['T', 'T'], 'feat1': ['foo', 'bar'] } ) - cls.validator = PostVEPProcessingValidator( - load_model(ResourceFile.XGB_BOOSTER_POC_UBJ.value) - ) + cls.validator = PostVEPProcessingValidator() + cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_UBJ.value) @classmethod def tearDownClass(cls) -> None: @@ -33,7 +32,8 @@ def test_validate_features_present_incorrect(self): self.assertRaises( KeyError, self.validator.validate_features_present, - self.dataset + self.dataset, + self.model.vep_features.values() ) diff --git a/tests/capice/vep/test_amino_acids.py b/tests/capice/vep/test_amino_acids.py index b7b3ba53..49f7b989 100644 --- a/tests/capice/vep/test_amino_acids.py +++ b/tests/capice/vep/test_amino_acids.py @@ -19,6 +19,14 @@ def test_process(self): 'nAA': ['G', 'C', 'C']}) pd.testing.assert_frame_equal(expected, observed) + def test_process_no_alt(self): + dataframe = pd.DataFrame({'Amino_acids': ['A', 'R', 'G']}) + observed = self.aa.process(dataframe) + expected = pd.DataFrame({'Amino_acids': ['A', 'R', 'G'], + 'oAA': ['A', 'R', 'G'], + 'nAA': ['A', 'R', 'G']}) + pd.testing.assert_frame_equal(expected, observed) + if __name__ == '__main__': unittest.main() diff --git a/tests/capice/vep/test_consequence.py b/tests/capice/vep/test_consequence.py index 05c72719..ea7a7b8e 100644 --- a/tests/capice/vep/test_consequence.py +++ b/tests/capice/vep/test_consequence.py @@ -73,6 +73,25 @@ def test_consequence(self): pd.testing.assert_frame_equal(observerd.sort_index(axis=1), expected.sort_index( axis=1), check_dtype=False) + def test_non_coding(self): + data = pd.DataFrame({ + 'variants': ['variant_1', 'variant_2', 'variant_3'], + 'Consequence': [np.nan, np.nan, np.nan] + }) + columns = data.columns + expected_altered = self.expected_data.copy(deep=True) + # Easier to locate the ones in self.expected_data than to hardcode a new one + expected_altered.loc[1, 'is_start_lost'] = 0 + expected_altered.loc[0, 'is_stop_lost'] = 0 + expected_altered.loc[0, 'is_transcript_ablation'] = 0 + expected = pd.concat([data, expected_altered], axis=1) + observed = Consequence().process(data) + self.assertFalse(observed[observed.columns.difference(columns)].isnull().values.any()) + pd.testing.assert_frame_equal( + observed.sort_index(axis=1), + 
expected.sort_index(axis=1) + ) + def test_consequence_warning(self): """ Tests that when a consequence is encountered that is not present within the processor diff --git a/tests/capice/vep/test_length.py b/tests/capice/vep/test_length.py index f41fc484..f10c6437 100644 --- a/tests/capice/vep/test_length.py +++ b/tests/capice/vep/test_length.py @@ -13,12 +13,12 @@ def setUpClass(cls): def test_process(self): dataframe = pd.DataFrame({ - 'ref': ['ATAG', 'A', 'C', 'AC'], - 'alt': ['A', 'ATG', 'A', 'GT']}) + 'REF': ['ATAG', 'A', 'C', 'AC'], + 'ALT': ['A', 'ATG', 'A', 'GT']}) observed = self.length.process(dataframe) expected = pd.DataFrame({ - 'ref': ['ATAG', 'A', 'C', 'AC'], - 'alt': ['A', 'ATG', 'A', 'GT'], + 'REF': ['ATAG', 'A', 'C', 'AC'], + 'ALT': ['A', 'ATG', 'A', 'GT'], 'Length': [3, 2, 0, 0]}) pd.testing.assert_frame_equal(expected, observed) diff --git a/tests/capice/vep/test_type.py b/tests/capice/vep/test_type.py index cf8cccc2..e7aaa74c 100644 --- a/tests/capice/vep/test_type.py +++ b/tests/capice/vep/test_type.py @@ -12,12 +12,12 @@ def setUpClass(cls): cls.type = type.Type() def test_process(self): - input_data_frame = pd.DataFrame({'ref': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], - 'alt': ['G', 'GCC', 'GG', 'CG', 'G', 'C']}) + input_data_frame = pd.DataFrame({'REF': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], + 'ALT': ['G', 'GCC', 'GG', 'CG', 'G', 'C']}) actual_output = self.type.process(input_data_frame) expected_output = pd.DataFrame({ - 'ref': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], - 'alt': ['G', 'GCC', 'GG', 'CG', 'G', 'C'], + 'REF': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], + 'ALT': ['G', 'GCC', 'GG', 'CG', 'G', 'C'], 'Type': ['SNV', 'DELINS', 'DELINS', 'INS', 'DELINS', 'DEL']}) pd.testing.assert_frame_equal(actual_output, expected_output) diff --git a/tests/resources/xgb_booster_poc.ubj b/tests/resources/xgb_booster_poc.ubj index d82d8d63..324341a2 100644 Binary files a/tests/resources/xgb_booster_poc.ubj and b/tests/resources/xgb_booster_poc.ubj differ
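Aside: the CategoricalProcessor tests above pin down a "top 5 plus other" one-hot scheme,
where each processable feature keeps its most frequent values and collapses the remainder
into an "other" level before dummy encoding. A minimal pandas-only sketch of that behaviour
(illustrative only; the helper name and the hard-coded n=5 are assumptions here, not the
CAPICE implementation itself):

    import pandas as pd

    def top_n_other_dummies(frame: pd.DataFrame, feature: str, n: int = 5) -> pd.DataFrame:
        # Keep the n most frequent values; map everything else to the literal 'other'.
        top = frame[feature].value_counts().index[:n]
        collapsed = frame[feature].where(frame[feature].isin(top), other='other')
        # One-hot encode and drop the raw column, mirroring what the tests assert.
        dummies = pd.get_dummies(collapsed, prefix=feature)
        return pd.concat([frame.drop(columns=feature), dummies], axis=1)

    frame = pd.DataFrame({'foo': ['bar', 'baz', 'barz', 'foobar', 'foobaz', 'last']})
    # Six distinct values -> five kept plus 'foo_other' (cf. test_creation_other);
    # with five or fewer distinct values no 'foo_other' column would appear.
    print(top_n_other_dummies(frame, 'foo').columns.tolist())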
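Likewise, the chr_pos_ref_alt preservation column asserted in test__create_preservation_col
amounts to a row-wise string join over the four identifying columns. A hedged sketch of the
equivalent pandas expression (the separator literal is taken from the tests above; the rest
is illustrative, not the CAPICE code):

    import pandas as pd

    SEPARATOR = '_VeryUniqueCAPICESeparator_'
    frame = pd.DataFrame(
        {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'REF': ['A', 'T', 'C'],
         'ALT': ['G', 'A', 'T']})
    # Joins e.g. 1, 123, A, G into '1_VeryUniqueCAPICESeparator_123_..._A_..._G'.
    frame['chr_pos_ref_alt'] = (
        frame[['chr', 'pos', 'REF', 'ALT']].astype(str).agg(SEPARATOR.join, axis=1)
    )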