diff --git a/resources/train_features.json b/resources/train_features.json
index 6fa0312a..160a6d1a 100644
--- a/resources/train_features.json
+++ b/resources/train_features.json
@@ -1,54 +1,13 @@
 {
-  "PolyPhenCat": null,
-  "PolyPhenVal": null,
-  "cDNApos": null,
-  "relcDNApos": null,
-  "SIFTcat": null,
-  "SIFTval": null,
-  "protPos": null,
-  "relProtPos": null,
-  "oAA": null,
-  "nAA": null,
-  "CDSpos": null,
-  "relCDSpos": null,
-  "ref": null,
-  "alt": null,
-  "is_regulatory_region_variant": null,
-  "is_regulatory_region_ablation": null,
-  "is_regulatory_region_amplification": null,
-  "is_missense_variant": null,
-  "is_intron_variant": null,
-  "is_upstream_gene_variant": null,
-  "is_downstream_gene_variant": null,
-  "is_synonymous_variant": null,
-  "is_TF_binding_site_variant": null,
-  "is_splice_donor_variant": null,
-  "is_coding_sequence_variant": null,
-  "is_splice_region_variant": null,
-  "is_stop_gained": null,
-  "is_splice_acceptor_variant": null,
-  "is_frameshift_variant": null,
-  "is_3_prime_UTR_variant": null,
-  "is_inframe_insertion": null,
-  "is_inframe_deletion": null,
-  "is_5_prime_UTR_variant": null,
-  "is_start_lost": null,
-  "is_non_coding_transcript_exon_variant": null,
-  "is_non_coding_transcript_variant": null,
-  "is_TFBS_ablation": null,
-  "is_TFBS_amplification": null,
-  "is_protein_altering_variant": null,
-  "is_stop_lost": null,
-  "is_stop_retained_variant": null,
-  "is_transcript_ablation": null,
-  "is_intergenic_variant": null,
-  "is_start_retained_variant": null,
-  "is_transcript_amplification": null,
-  "is_incomplete_terminal_codon_variant": null,
-  "is_mature_miRNA_variant": null,
-  "is_NMD_transcript_variant": null,
-  "is_feature_elongation": null,
-  "is_feature_truncation": null,
+  "PolyPhen": null,
+  "SIFT": null,
+  "cDNA_position": null,
+  "CDS_position": null,
+  "Protein_position": null,
+  "Amino_acids": null,
+  "REF": null,
+  "ALT": null,
+  "Consequence": null,
   "SpliceAI_pred_DP_AG": null,
   "SpliceAI_pred_DP_AL": null,
   "SpliceAI_pred_DP_DG": null,
@@ -57,8 +16,6 @@
   "SpliceAI_pred_DS_AL": null,
   "SpliceAI_pred_DS_DG": null,
   "SpliceAI_pred_DS_DL": null,
-  "Type": null,
-  "Length": null,
   "Grantham": null,
   "phyloP": null
 }
\ No newline at end of file
diff --git a/src/molgenis/capice/cli/args_handler_parent.py b/src/molgenis/capice/cli/args_handler_parent.py
index 0abbf389..a0ba0fc9 100644
--- a/src/molgenis/capice/cli/args_handler_parent.py
+++ b/src/molgenis/capice/cli/args_handler_parent.py
@@ -119,6 +119,7 @@ def _retrieve_argument_from_list(self,
             return self._single_argument_retriever(arg, arg_name, has_default)
         except IOError as e:
             self.parser.error(e)
+        return None

     @staticmethod
     def _single_argument_retriever(arg: list | None,
diff --git a/src/molgenis/capice/cli/args_handler_predict.py b/src/molgenis/capice/cli/args_handler_predict.py
index 8389ec4b..df8c19e8 100644
--- a/src/molgenis/capice/cli/args_handler_predict.py
+++ b/src/molgenis/capice/cli/args_handler_predict.py
@@ -20,7 +20,8 @@ def _extension(self):

     @property
     def _model_extension(self) -> tuple[str]:
-        return '.json', '.ubj'
+        # Ignored, because the number of values in the tuple does not matter.
+        return '.json', '.ubj'  # type: ignore

     def _model_extension_str(self) -> str:
         return self._join_extensions(self._model_extension)
diff --git a/src/molgenis/capice/main_capice.py b/src/molgenis/capice/main_capice.py
index c7ca7e88..d71d0c3b 100644
--- a/src/molgenis/capice/main_capice.py
+++ b/src/molgenis/capice/main_capice.py
@@ -1,12 +1,15 @@
+import os
 from abc import ABC, abstractmethod

+import pandas as pd
+
 from molgenis.capice.core.logger import Logger
 from molgenis.capice.utilities.enums import Column
 from molgenis.capice.core.capice_manager import CapiceManager
 from molgenis.capice.utilities.input_parser import InputParser
 from molgenis.capice.core.capice_exporter import CapiceExporter
-from molgenis.capice.utilities.preprocessor import PreProcessor
 from molgenis.capice.utilities.manual_vep_processor import ManualVEPProcessor
+from molgenis.capice.utilities.categorical_processor import CategoricalProcessor
 from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor
 from molgenis.capice.validators.post_file_parse_validator import PostFileParseValidator

@@ -45,7 +48,7 @@ def __init__(self, input_path, output_path, output_given):
     def run(self):
         pass

-    def _load_file(self, additional_required_features: list = None):
+    def _load_file(self, additional_required_features: list | None = None):
         """
         Function to load the input TSV file into main
         :return: pandas DataFrame
@@ -66,35 +69,55 @@ def _load_file(self, additional_required_features: list | None = None):
         return input_file

     @staticmethod
-    def process(loaded_data):
+    def process(loaded_data: pd.DataFrame,
+                process_features: list[str]) -> tuple[pd.DataFrame, dict[str, list[str]]]:
+        # The Returns section may look odd, but Google-style docstrings do not support
+        # documenting multiple return values separately.
         """
-        Function to process the VEP features to CAPICE features.
+        Function to call the ManualVEPProcessor over loaded_data using the supplied
+        process_features list.
+
+        Args:
+            loaded_data:
+                The pandas dataframe over which the VEP features should be processed.
+
+            process_features:
+                List containing either all input features, possibly including VEP features
+                (in the case of train), or only the input features that can be VEP
+                processed (in the case of predict).
+
+        Returns:
+            tuple:
+                Tuple [0]: the output dataframe containing all VEP processed features
+                according to process_features. Depending on the processor property "drop",
+                a feature present in process_features may be dropped from the columns of
+                the output dataframe.
+                Tuple [1]: the output dictionary containing each VEP feature (key) and the
+                derivative features that originate from that VEP feature (value). The
+                property "drop" has no influence here.
         """
         processor = ManualVEPProcessor()
-        processed_data = processor.process(dataset=loaded_data)
-        return processed_data
+        processed_data = processor.process(loaded_data, process_features)
+        processed_features = processor.get_feature_processes()
+        # No validation here, since that is specific to predict.
+        # Predict does not technically need processed_features either, since within predict
+        # the first element of the tuple can simply be indexed.
+        # Returning both is still relevant, in case we want to validate processed_features
+        # for predict in the future.
+        return processed_data, processed_features

-    def preprocess(self, loaded_data, model_features=None):
-        """
-        Function to perform the preprocessing of the loaded data to convert
-        categorical columns.
-        :param loaded_data: Pandas dataframe of the imputed CAPICE data
-        :param model_features: list (default None), a list containing all
-        the features present within a model file. When set to None,
-        PreProcessor will activate the train protocol.
-
-        Note: please adjust self.exclude_features: to include all of the
-        features that the preprocessor should NOT process.
-        Features chr_pos_ref_alt, chr and pos are hardcoded and
-        thus do not have to be included.
-        """
-        preprocessor = PreProcessor(
-            exclude_features=self.exclude_features,
-            model_features=model_features)
-        capice_data = preprocessor.preprocess(loaded_data)
-        return capice_data
+    @staticmethod
+    def categorical_process(loaded_data: pd.DataFrame,
+                            processing_features: dict[str, list[str]] | None = None,
+                            train_features: list | None = None):
+        """
+        Function to call the CategoricalProcessor over loaded_data, using either the
+        train_features list (train) or the processing_features dictionary (predict).
+        Returns both the processed dataframe and the dictionary of processed features.
+        """
+        processor = CategoricalProcessor()
+        capice_data, processed_features = processor.process(
+            loaded_data,
+            processable_features=train_features,
+            predetermined_features=processing_features
+        )
+        return capice_data, processed_features

-    def _export(self, dataset, output):
+    def _export(self, dataset: pd.DataFrame, output: os.PathLike):
         """
         Function to prepare the data to be exported
         """
diff --git a/src/molgenis/capice/main_predict.py b/src/molgenis/capice/main_predict.py
index 51106adf..1c376c56 100644
--- a/src/molgenis/capice/main_predict.py
+++ b/src/molgenis/capice/main_predict.py
@@ -2,13 +2,14 @@
 from molgenis.capice.utilities.enums import Column
 from molgenis.capice.utilities.predictor import Predictor
 from molgenis.capice.utilities.class_suggestor import ClassSuggestor
+from molgenis.capice.validators.predict_validator import PredictValidator
 from molgenis.capice.validators.post_vep_processing_validator import PostVEPProcessingValidator


 class CapicePredict(Main):
     """
     Predict class of CAPICE to call the different modules to impute,
-    preprocess and eventually predict a score over a CAPICE annotated file.
+    process and eventually predict a score over a CAPICE annotated file.
""" def __init__(self, input_path, model, output_path, output_given): @@ -26,27 +27,29 @@ def run(self): Column.id_source.value, Column.feature.value, Column.feature_type.value]) - capice_data = self.process(loaded_data=capice_data) - capice_data = self.preprocess(loaded_data=capice_data, - model_features=self.model.get_booster().feature_names) + capice_data = self.process( + loaded_data=capice_data, + process_features=list(self.model.vep_features.keys()) + )[0] + PostVEPProcessingValidator().validate_features_present( + capice_data, self.model.vep_features.values() + ) + capice_data = self.categorical_process( + loaded_data=capice_data, + processing_features=self.model.processable_features, + train_features=None + )[0] capice_data = self.predict(loaded_data=capice_data) capice_data = self.apply_suggested_class(predicted_data=capice_data) self._export(dataset=capice_data, output=self.output) - def process(self, loaded_data): - """ - Function to process the VEP file to a CAPICE file - """ - processed_data = super().process(loaded_data) - validator = PostVEPProcessingValidator(self.model) - validator.validate_features_present(processed_data) - return processed_data - def predict(self, loaded_data): """ Function to call the correct model to predict CAPICE scores :return: pandas DataFrame """ + validator = PredictValidator() + validator.validate_data_predict_ready(loaded_data, self.model) predictor = Predictor(self.model) capice_data = predictor.predict(loaded_data) return capice_data diff --git a/src/molgenis/capice/main_train.py b/src/molgenis/capice/main_train.py index 892eff77..5b20653d 100644 --- a/src/molgenis/capice/main_train.py +++ b/src/molgenis/capice/main_train.py @@ -1,5 +1,6 @@ import json +import numpy as np import pandas as pd import xgboost as xgb from scipy import stats @@ -7,6 +8,7 @@ from molgenis.capice.main_capice import Main from molgenis.capice import __version__ +from molgenis.capice.utilities import check_if_in_list from molgenis.capice.utilities.enums import TrainEnums from molgenis.capice.core.capice_exporter import CapiceExporter @@ -22,7 +24,7 @@ def __init__(self, input_path, json_path, test_split, output_path, output_given, # Impute JSON. self.json_path = json_path - self.log.debug('Input impute JSON confirmed: %s', self.json_path) + self.log.debug('Input model features JSON confirmed: %s', self.json_path) # Train test size. self.train_test_size = test_split @@ -45,7 +47,7 @@ def __init__(self, input_path, json_path, test_split, output_path, output_given, self.random_state = 45 self.split_random_state = 4 self.model_random_state = 0 - self.processed_features = [] + self.train_features = [] self.loglevel = self.manager.loglevel self.exporter = CapiceExporter(file_path=self.output, output_given=self.output_given) @@ -55,37 +57,97 @@ def run(self): order to create new CAPICE models. 
""" data = self._load_file(additional_required_features=self.additional_required) - data = self.process(loaded_data=data) with open(self.json_path, 'rt') as impute_values_file: - json_dict = json.load(impute_values_file) - self._validate_impute_complete(data, json_dict) + train_features = list(json.load(impute_values_file).keys()) + + self._validate_train_features_duplicates(train_features) + + self._validate_features_present(data, train_features) + + data, vep_processed = self.process( + loaded_data=data, + process_features=train_features + ) + + processable_features = self._reset_processing_features( + train_features, + vep_processed, + data.columns + ) + + processed_data, processed_features = self.categorical_process( + loaded_data=data, + train_features=processable_features, + processing_features=None + ) + + self._set_train_features(processable_features, processed_features) - processed_data = self.preprocess(loaded_data=data) - self._get_processed_features(dataset=processed_data, impute_keys=json_dict.keys()) processed_train, processed_test = self.split_data(dataset=processed_data, test_size=self.train_test_size) model = self.train(test_set=processed_test, train_set=processed_train) - setattr(model, "model_features", list(json_dict.keys())) + setattr(model, "vep_features", vep_processed) + setattr(model, "processable_features", processed_features) setattr(model, 'CAPICE_version', __version__) self.exporter.export_capice_model(model=model) - def _validate_impute_complete(self, dataset, json_dict): - """ - - :param pd.DataFrame dataset: - :param dict json_dict: - :return: - """ + def _validate_features_present(self, dataset, train_features) -> None: missing = [] - for key in json_dict.keys(): + for key in train_features: if key not in dataset.columns: missing.append(key) if len(missing) > 0: - error_message = 'Impute file missing needed columns for input file: %s' + error_message = 'Train features file missing needed columns for input file: %s' self.log.critical(error_message, missing) raise ValueError(error_message % missing) + def _validate_train_features_duplicates(self, input_train_features: list): + values, counts = np.unique(input_train_features, return_counts=True) + if counts[counts > 1].any(): + error_message = 'Detected duplicate features in user supplied train features: %s' + duplicates = ', '.join(values[counts > 1]) + self.log.critical(error_message, duplicates) + raise KeyError(error_message % duplicates) + + @staticmethod + def _reset_processing_features( + input_train_features: list, + vep_processed: dict, + vep_processed_dataframe_columns: pd.DataFrame.columns + ) -> list[str]: + feature_list = [] + # Adds the VEP input features to which the processor has property drop = False + for feature in vep_processed.keys(): + if feature in vep_processed_dataframe_columns: + feature_list.append(feature) + # Adds back the user input features, but avoiding adding duplicates and + # avoiding the features that had property drop = True + for feature in input_train_features: + if feature not in feature_list and feature not in vep_processed.keys(): + feature_list.append(feature) + # Extending the features with the VEP processors output features + # Has to be new list otherwise features from feature_list go missing + return_list = check_if_in_list(vep_processed.values(), feature_list) + # Merging back with feature_list + return_list.extend(feature_list) + return return_list + + def _set_train_features(self, processable_features: list, processed_features: dict) -> \ + None: + 
+        train_features = []
+        for feature in processable_features:
+            if feature not in processed_features.keys():
+                train_features.append(feature)
+        for feature_name, features in processed_features.items():
+            for feature in features:
+                train_features.append(f'{feature_name}_{feature}')
+        self.log.info(
+            'The following features have been selected for training: %s',
+            ', '.join(train_features)
+        )
+        self.train_features = train_features
+
     def split_data(self, dataset, test_size: float):
         """
         Function to split any given dataset into 2 datasets using the test_size
@@ -99,23 +161,6 @@ def split_data(self, dataset, test_size: float):
                                        random_state=self.split_random_state)
         return train, test

-    def _get_processed_features(self, dataset: pd.DataFrame, impute_keys):
-        """
-        Function to save the columns of a dataset that have been processed and
-        thus are an output column of the CADD annotation.
-        :param dataset: pandas.DataFrame
-        """
-        for column in dataset.columns:
-            for feature in impute_keys:
-                if (column == feature or column.startswith(feature)) and \
-                        column not in self.processed_features:
-                    self.processed_features.append(column)
-        self.log.info(
-            'The following features have been selected for training: %s', ', '.join(
-                self.processed_features
-            )
-        )
-
     def _set_verbosity_from_log_level(self):
         """
         Uses loglevel to set verbosity and xg boost verbosity
@@ -141,7 +186,7 @@ def _create_eval_set(self, xgb_version, test_set):
         :return: a list with tuple with pandas Dataframe, pandas Series and
         possibly "test"
         eval_set
         """
-        eval_data = [test_set[self.processed_features],
+        eval_data = [test_set[self.train_features],
                      test_set[TrainEnums.binarized_label.value]]
         if int(xgb_version.split('.')[0]) < 1:
             eval_data.append('test')
@@ -200,7 +245,7 @@ def train(self, test_set: pd.DataFrame, train_set: pd.DataFrame):
         eval_set = self._create_eval_set(xgb.__version__, test_set)

         self.log.info('Random search starting, please hold.')
-        randomised_search_cv.fit(train_set[self.processed_features],
+        randomised_search_cv.fit(train_set[self.train_features],
                                  train_set[TrainEnums.binarized_label.value],
                                  eval_set=eval_set,
                                  verbose=xgb_verbosity,
diff --git a/src/molgenis/capice/utilities/__init__.py b/src/molgenis/capice/utilities/__init__.py
index 605e46f2..71fd86b7 100644
--- a/src/molgenis/capice/utilities/__init__.py
+++ b/src/molgenis/capice/utilities/__init__.py
@@ -1,6 +1,7 @@
 import functools
 import warnings
 from pathlib import Path
+from collections.abc import Iterable


 def get_project_root_dir():
@@ -23,3 +24,31 @@ def new_func(*args, **kwargs):
             return func(*args, **kwargs)

     return new_func
+
+
+def check_if_in_list(list_of_lists: list[list[object]], to_check_list: Iterable):
+    """
+    Checks, for each value inside the lists contained within list_of_lists, whether that
+    value (which can be an int, str, float, etc.) is present in to_check_list. Values
+    absent from to_check_list are added to the return list; values present in it are not.
+
+    Args:
+        list_of_lists:
+            List containing lists of values (object). Each of these values is checked
+            independently against to_check_list.
+        to_check_list:
+            Iterable against which the individual items of list_of_lists are checked.
+
+    Returns:
+        list:
+            A single list containing all individual items of list_of_lists that do not
+            occur in to_check_list.
+
+    """
+    return_list = []
+    for items in list_of_lists:
+        for item in items:
+            if item not in to_check_list:
+                return_list.append(item)
+    return return_list
diff --git a/src/molgenis/capice/utilities/categorical_processor.py b/src/molgenis/capice/utilities/categorical_processor.py
new file mode 100644
index 00000000..a9d8f85b
--- /dev/null
+++ b/src/molgenis/capice/utilities/categorical_processor.py
@@ -0,0 +1,174 @@
+import numpy as np
+import pandas as pd
+
+from molgenis.capice.core.logger import Logger
+from molgenis.capice.core.capice_manager import CapiceManager
+from molgenis.capice.utilities.column_utils import ColumnUtils
+from molgenis.capice.utilities.enums import Column, UniqueSeparator
+
+
+class CategoricalProcessor:
+    """
+    Class to process the categorical columns of the data into separate one-hot encoded
+    columns before predicting or training.
+    """
+
+    def __init__(self):
+        self.log = Logger().logger
+        self.manager = CapiceManager()
+
+    def process(
+            self,
+            dataset: pd.DataFrame,
+            processable_features: list[str] | None = None,
+            predetermined_features: dict[str, list] | None = None
+    ) -> tuple[pd.DataFrame, dict[str, list]]:
+        """
+        Callable method of CategoricalProcessor to start processing the categorical
+        columns of dataset, according to either processable_features (in case of train)
+        or predetermined_features (in case of predict).
+
+        Args:
+            dataset:
+                The dataset over which the categorical features should be processed.
+            processable_features:
+                List of the features that this module is allowed to process (train).
+                When used in predict: set to None.
+            predetermined_features:
+                Dictionary of the features as they enter the CategoricalProcessor (key)
+                and the output features they should be turned into (values).
+                When used in train: set to None.
+        Returns:
+            processed_dataset:
+                Dataset in which each "O" (object) dtype column has been converted into
+                new columns containing either 0 or 1, depending on whether a sample has
+                that categorical value or not.
+            processing_features:
+                Dictionary containing the input columns (key) and the output columns they
+                were converted into (values).
+        Raises:
+            ValueError:
+                ValueError is raised when both processable_features and
+                predetermined_features are set to None.
+        """
+        self.log.info('Starting processing categorical columns.')
+        self._validate_one_feature_list_present(processable_features, predetermined_features)
+        self._create_preservation_col(dataset)
+        if predetermined_features is None:
+            # Type ignore, since mypy otherwise takes issue with the typing:
+            # processable_features can be None, so it is considered
+            # Optional[list[str]] instead of list[str].
+            processing_features = self._get_categorical_columns(
+                dataset,
+                processable_features  # type: ignore
+            )
+        else:
+            processing_features = predetermined_features
+
+        processed_dataset = self._get_dummies(dataset, processing_features)
+
+        self._ensure_columns_present(processed_dataset, processing_features)
+
+        self.log.info('Successfully processed categorical data.')
+        return processed_dataset, processing_features
+
+    def _validate_one_feature_list_present(
+            self,
+            processable_features: list | None,
+            predetermined_features: dict[str, list] | None
+    ) -> None:
+        if processable_features is None and predetermined_features is None:
+            error_message = 'Neither processable_features nor predetermined_features was supplied!'
+            self.log.critical(error_message)
+            raise ValueError(error_message)
+
+    @staticmethod
+    def _create_preservation_col(dataset: pd.DataFrame) -> None:
+        """
+        Function to create the chr_pos_ref_alt column so that it doesn't get
+        lost in preprocessing. The column is added to dataset in place.
+        :param dataset: unprocessed pandas DataFrame
+        """
+        dataset[Column.chr_pos_ref_alt.value] = dataset[
+            [Column.chr.value, Column.pos.value, Column.ref.value, Column.alt.value]
+        ].astype(str).agg(UniqueSeparator.unique_separator.value.join, axis=1)
+
+    def _get_categorical_columns(self, dataset: pd.DataFrame,
+                                 processable_features: list[str]) -> dict[str, list]:
+        """
+        Method for when predetermined_features is None (usually in case of train) to
+        determine, per categorical feature, the top values that should be used for
+        pandas.get_dummies().
+        Loops through the "O" (object) dtype features in dataset and, if the feature is
+        present in the processable_features list, obtains the most common values of that
+        feature.
+        """
+        processing_features = {}
+        for feature in dataset.select_dtypes(include=["O"]).columns:
+            if feature in processable_features:
+                self.log.debug('Converting the categorical column: %s', feature)
+                processing_features[feature] = self._process_object(dataset[feature])
+        return processing_features
+
+    def _process_object(self, feature_column: pd.Series) -> list:
+        """
+        Method to obtain the top 5 categories of a categorical column.
+        """
+        top_categories = self._get_top_n_cats(feature_column, return_num=5)
+        return top_categories
+
+    def _get_top_n_cats(self, column: pd.Series, return_num: int) -> list:
+        """
+        Function used when a training file is processed, to get the top return_num values
+        within a categorical column. If more values are present, the remainder is grouped
+        under the "other" category.
+        :param column: pandas Series
+        :param return_num: integer
+        :return: list
+        """
+        counts = column.value_counts().index
+        value_counts = list(counts[:return_num])
+        if len(counts) > return_num:
+            value_counts.append(Column.other.value)
+        message = 'For feature: %s saved the following values: %s'
+        self.log.info(message, column.name, ', '.join(value_counts))
+        return value_counts
+
+    def _get_dummies(self, dataset: pd.DataFrame, processing_features: dict) -> pd.DataFrame:
+        """
+        Method to call pandas.get_dummies() to convert each categorical column into
+        multiple columns of 0 and 1.
+        """
+        for feature, feature_values in processing_features.items():
+            self._set_other_values(dataset, feature, feature_values)
+        processed_dataset = pd.get_dummies(dataset, columns=processing_features.keys())
+        return processed_dataset
+
+    def _set_other_values(self, dataset: pd.DataFrame, feature_name: str,
+                          feature_values: list) -> None:
+        """
+        Method to set all entries within a column that are not in the feature_values list
+        to "other".
+ """ + self.log.debug( + 'Converting %d features for feature: %s : %s', + len(feature_values), + feature_name, + ", ".join(feature_values) + ) + dataset[feature_name] = np.where( + dataset[feature_name].isin(feature_values), + dataset[feature_name], + Column.other.value + ) + + def _ensure_columns_present(self, dataset: pd.DataFrame, categorical_out_columns: dict) -> None: + merged_columns = [] + for main_feature, derivative_features in categorical_out_columns.items(): + for feature in derivative_features: + merged_columns.append(f'{main_feature}_{feature}') + column_utils = ColumnUtils() + column_utils.set_specified_columns(merged_columns) + missing = column_utils.get_missing_diff_with(dataset.columns) + for feature in missing: + message = 'Detected column %s not present in columns. Adding full column on NaN' + self.log.debug(message, feature) + dataset[feature] = np.nan diff --git a/src/molgenis/capice/utilities/dynamic_loader.py b/src/molgenis/capice/utilities/dynamic_loader.py index cfa98da6..3f707510 100644 --- a/src/molgenis/capice/utilities/dynamic_loader.py +++ b/src/molgenis/capice/utilities/dynamic_loader.py @@ -21,7 +21,7 @@ def __init__(self, required_attributes: list, path): self.path = path self._check_dir_exists() self.required_attributes = required_attributes - self.modules = {} + self.modules: dict[str, object] = {} def load_manual_annotators(self): """ @@ -82,14 +82,15 @@ def _load_modules_from_path(path): modules.append(module) return modules - def _import(self, usable_modules: list): + def _import(self, usable_modules: list[str]) -> dict[str, object]: """ Function to dynamically load in the modules using the import_module library. :param usable_modules: list of absolute paths to potential modules :return: list of usable modules """ - return_modules = {} + # For some reason, mypy wants this line to be Typed instead of the method. 
+        return_modules: dict[str, object] = {}
         for module in usable_modules:
             name = os.path.basename(module).split('.py')[0]
             spec = util.spec_from_file_location(name=name, location=module)
diff --git a/src/molgenis/capice/utilities/enums.py b/src/molgenis/capice/utilities/enums.py
index 88244e64..1cb84d22 100644
--- a/src/molgenis/capice/utilities/enums.py
+++ b/src/molgenis/capice/utilities/enums.py
@@ -8,8 +8,8 @@ class Column(Enum):
     chr_pos_ref_alt = 'chr_pos_ref_alt'
     chr = 'chr'
     pos = 'pos'
-    ref = 'ref'
-    alt = 'alt'
+    ref = 'REF'
+    alt = 'ALT'
     gene_name = 'gene_name'
     gene_id = 'gene_id'
     id_source = 'id_source'
@@ -17,6 +17,7 @@ class Column(Enum):
     feature_type = 'feature_type'
     score = 'score'
     suggested_class = 'suggested_class'
+    other = 'other_CAPICE_value'


 class OutputClasses(Enum):
diff --git a/src/molgenis/capice/utilities/load_file_postprocessor.py b/src/molgenis/capice/utilities/load_file_postprocessor.py
index 9f3b5366..251026a8 100644
--- a/src/molgenis/capice/utilities/load_file_postprocessor.py
+++ b/src/molgenis/capice/utilities/load_file_postprocessor.py
@@ -33,8 +33,6 @@ def _col_renamer(self):
         self.dataset.rename(
             columns={'CHROM': Column.chr.value,
                      'POS': Column.pos.value,
-                     'REF': Column.ref.value,
-                     'ALT': Column.alt.value,
                      'Gene': Column.gene_id.value,
                      'SYMBOL_SOURCE': Column.id_source.value,
                      'Feature': Column.feature.value,
diff --git a/src/molgenis/capice/utilities/manual_vep_processor.py b/src/molgenis/capice/utilities/manual_vep_processor.py
index 45cc8487..b168cdd7 100644
--- a/src/molgenis/capice/utilities/manual_vep_processor.py
+++ b/src/molgenis/capice/utilities/manual_vep_processor.py
@@ -15,22 +15,34 @@ class ManualVEPProcessor:

     def __init__(self):
         self.log = Logger().logger
+        self.feature_processing_tracker = {}

-    def process(self, dataset: pd.DataFrame):
+    def process(self, dataset: pd.DataFrame, process_features: list[str]) -> pd.DataFrame:
         """
         Callable method for the ManualVEPProcessor to start processing.
         Loads all the VEP processors dynamically from /src/main/python/vep.
-        :param dataset: pandas.DataFrame: loaded pandas dataframe of the user
-        provided input TSV.
-        :return: pandas.DataFrame: dataframe with processed features
+
+        Args:
+            dataset: The input dataset over which the VEP features should be processed.
+            process_features: A collection of all the input features that are used in
+                either training or predicting, over which the VEP processing should happen.
+ + Returns: + pandas.DataFrame: The input dataset, processed on the consequences + """ self.log.info('Starting manual VEP feature processing.') vep_annotators = self._load_vep_processors() dropping_columns = [] n_feats_processed = 0 for processor in vep_annotators: - if processor.name in dataset.columns and processor.usable: + if ( + processor.name in dataset.columns and + processor.name in process_features and + processor.usable + ): self.log.debug('Processing: %s', processor.name) + self._add_feature_tracking(processor.name, processor.columns) dataset = processor.process(dataset) if processor.drop and processor.name not in dropping_columns: dropping_columns.append(processor.name) @@ -43,6 +55,22 @@ def process(self, dataset: pd.DataFrame): self.log.debug('Processed %d features.', n_feats_processed) return dataset + def _add_feature_tracking(self, processor_name: str, processor_features: list[str]): + if processor_name not in self.feature_processing_tracker.keys(): + self.feature_processing_tracker[processor_name] = processor_features + else: + self.feature_processing_tracker[processor_name].extend(processor_features) + + def get_feature_processes(self) -> dict[str, list[str]]: + """ + Getter for the dictionary containing all the processed features and their output features. + + Returns: + dict: + Input VEP processing features (key) and their output features (values) + """ + return self.feature_processing_tracker + def _load_vep_processors(self): location = os.path.join(get_project_root_dir(), 'vep') self.log.debug('Loading modules at %s', location) diff --git a/src/molgenis/capice/utilities/preprocessor.py b/src/molgenis/capice/utilities/preprocessor.py deleted file mode 100644 index 42bd45a9..00000000 --- a/src/molgenis/capice/utilities/preprocessor.py +++ /dev/null @@ -1,194 +0,0 @@ -import numpy as np -import pandas as pd - -from molgenis.capice.core.logger import Logger -from molgenis.capice.core.capice_manager import CapiceManager -from molgenis.capice.utilities.enums import Column, UniqueSeparator -from molgenis.capice.utilities.column_utils import ColumnUtils - - -class PreProcessor: - """ - Class to preprocess the data before predicting or training to separate - categorical columns. - """ - - def __init__(self, exclude_features: list, model_features: list = None): - """ - :param exclude_features: list, - all the features that the preprocessor should not process. - Features that are already excluded include: - chr_pos_ref_alt, chr and pos. - :param model_features: list (default None), a list containing all - the features present within a model file. - """ - self.log = Logger().logger - self.manager = CapiceManager() - self.log.info('Preprocessor started.') - self.train = False - self.exclude_features = [ - Column.chr_pos_ref_alt.value, - Column.chr.value, - Column.pos.value - ] - self.exclude_features += exclude_features - self.model_features = model_features - self.objects = [] - - def _is_train(self): - if self.model_features is None: - self.train = True - - def preprocess(self, dataset: pd.DataFrame): - """ - Callable function for the preprocessor to start preprocessing. 
- :param dataset: unprocessed pandas DataFrame - :return: processed pandas Dataframe - """ - self._is_train() - dataset = self._create_preservation_col(dataset) - self._get_categorical_columns(dataset) - processed_dataset = self._process_objects(dataset) - if not self.train: - processed_dataset = self._ensure_columns_present(processed_dataset) - self.log.info('Successfully preprocessed data.') - return processed_dataset - - @staticmethod - def _create_preservation_col(dataset): - """ - Function to create the chr_pos_ref_alt column so that it doesn't get - lost in preprocessing. - :param dataset: unprocessed pandas DataFrame - :return: unprocessed pandas DataFrame - containing column 'chr_pos_ref_alt' - """ - dataset[Column.chr_pos_ref_alt.value] = dataset[ - [Column.chr.value, Column.pos.value, Column.ref.value, Column.alt.value] - ].astype(str).agg(UniqueSeparator.unique_separator.value.join, axis=1) - return dataset - - def _get_categorical_columns(self, dataset: pd.DataFrame): - """ - Function to get the categorical columns that are within the supplied - annotation features of the imputing file. - :param dataset: pandas DataFrame - """ - for feature in dataset.select_dtypes(include=["O"]).columns: - if feature not in self.exclude_features: - self.objects.append(feature) - self.log.debug('Converting the categorical columns: %s.', ', '.join(self.objects)) - - def _process_objects(self, dataset: pd.DataFrame): - """ - (If train) will create a dictionary telling the processor how many - categories are within a certain column. - If not train: Will look up each annotation feature from the impute file - within the columns of the datafile (either in full name or the column - starts with the feature from the impute file). - This dictionary is then passed to the actual processor. - :param dataset: unprocessed pandas DataFrame - :return: processed pandas DataFrame - """ - annotation_feats_dict = {} - if self.train: - hardcoded_features = [Column.ref.value, Column.alt.value] - for feature in hardcoded_features: - annotation_feats_dict[feature] = 5 - self.log.info('Training protocol, creating new categorical conversion identifiers.') - for feat in self.objects: - if feat not in annotation_feats_dict.keys(): - annotation_feats_dict[feat] = 5 - else: - for feature in self.objects: - annotation_feats_dict = self._process_objects_no_train( - feature=feature, - annotation_features_dict=annotation_feats_dict - ) - processed_data = self._process_categorical_vars( - dataset=dataset, - annotation_feats_dict=annotation_feats_dict - ) - return processed_data - - def _process_objects_no_train(self, feature: str, annotation_features_dict: dict): - for model_feature in self.model_features: - if model_feature.startswith(feature): - extension = model_feature.split(''.join([feature, '_']))[-1] - if feature in annotation_features_dict.keys(): - annotation_features_dict[feature].append(extension) - else: - annotation_features_dict[feature] = [extension] - return annotation_features_dict - - def _process_categorical_vars(self, dataset: pd.DataFrame, annotation_feats_dict: dict): - """ - Processor of categorical columns. Will create new columns based on the - quantity of a value within a column. 
- :param dataset: unprocessed pandas DataFrame - :param annotation_feats_dict: - dictionary that is to contain the levels for each categorical - feature - :return: processed pandas DataFrame - """ - if self.train: - for annotation_feature in annotation_feats_dict.keys(): - feature_names = self._get_top_n_cats( - column=dataset[annotation_feature], - return_num=annotation_feats_dict[annotation_feature] - ) - dataset[annotation_feature] = np.where( - dataset[annotation_feature].isin(feature_names), - dataset[annotation_feature], - 'other' - ) - else: - for annotation_feature in annotation_feats_dict.keys(): - feature_names = annotation_feats_dict[annotation_feature] - self.log.debug('For feature: %s loaded %s levels: %s', - annotation_feature, - len(feature_names), - feature_names - ) - dataset[annotation_feature] = np.where( - dataset[annotation_feature].isin(feature_names), - dataset[annotation_feature], - 'other' - ) - dataset = pd.get_dummies(dataset, columns=list(annotation_feats_dict.keys())) - - return dataset - - def _get_top_n_cats(self, column: pd.Series, return_num: int): - """ - Function for when a training file is preprocessed to get the top - return_num quantity values within a categorical column. - Some converting is done for the logger to be able to print them. - :param column: pandas Series - :param return_num: integer - :return: pandas Series - """ - value_counts = column.value_counts().index[:return_num].values - printable_value_counts = [] - for value in value_counts: - if not isinstance(value, str): - value = str(value) - printable_value_counts.append(value) - message = 'For feature: %s saved the following values: %s' - self.log.info(message, column.name, ', '.join(printable_value_counts)) - return value_counts - - def _ensure_columns_present(self, dataset): - """ - Function to ensure that for the prediction all prediction columns - are present. If a columns is not present, add it with a full - columns of NaN. - """ - column_utils = ColumnUtils() - column_utils.set_specified_columns(self.model_features) - missing = column_utils.get_missing_diff_with(dataset.columns) - for feature in missing: - message = 'Detected column %s not present in columns. Adding full column of NaN' - self.log.debug(message, feature) - dataset[feature] = np.nan - return dataset diff --git a/src/molgenis/capice/validators/input_validator.py b/src/molgenis/capice/validators/input_validator.py index 9c4051bc..0078f195 100644 --- a/src/molgenis/capice/validators/input_validator.py +++ b/src/molgenis/capice/validators/input_validator.py @@ -16,7 +16,7 @@ def validate_input_path(input_path: os.PathLike, extension: tuple[str]): """ if not os.path.exists(input_path): raise FileNotFoundError(f'{input_path} does not exist!') - if not (input_path.endswith(extension)): + if not str(input_path).endswith(extension): raise IOError(f'{input_path} does not match required extension: ' f'{", ".join(extension)}') diff --git a/src/molgenis/capice/validators/model_validator.py b/src/molgenis/capice/validators/model_validator.py index 3a3934f3..bb641641 100644 --- a/src/molgenis/capice/validators/model_validator.py +++ b/src/molgenis/capice/validators/model_validator.py @@ -5,7 +5,8 @@ def validate_has_required_attributes(model): Function to validate if the required attributes CAPICE_version, impute_values and predict_proba are present. 
""" - required_attributes = ['CAPICE_version', 'model_features', 'predict_proba'] + required_attributes = ['CAPICE_version', 'vep_features', + 'processable_features', 'predict_proba'] for attribute in required_attributes: if attribute not in dir(model): raise AttributeError(f'Unable to locate attribute {attribute} in model file!') diff --git a/src/molgenis/capice/validators/post_file_parse_validator.py b/src/molgenis/capice/validators/post_file_parse_validator.py index 78fdab67..27ef95e7 100644 --- a/src/molgenis/capice/validators/post_file_parse_validator.py +++ b/src/molgenis/capice/validators/post_file_parse_validator.py @@ -14,7 +14,7 @@ def validate_n_columns(self, dataset): Validator to make sure that at least 4 columns are loaded (chr, pos, ref, alt). Does NOT check for the names of these columns! """ - if isinstance(dataset, pd.Series) or not dataset.shape[1] >= 4: + if isinstance(dataset, pd.Series) or dataset.shape[1] < 4: error_message = 'Loaded dataset does NOT have enough features! ' \ 'Is there a header present that does not start ' \ 'with ##?' @@ -31,7 +31,7 @@ def validate_variants_present(self, dataset): raise ValueError(error_message) def validate_minimally_required_columns( - self, dataset, additional_required_features: list = None + self, dataset, additional_required_features: list | None = None ): """ Validator for both predict and train to check if the very least columns diff --git a/src/molgenis/capice/validators/post_vep_processing_validator.py b/src/molgenis/capice/validators/post_vep_processing_validator.py index 738fdf8e..712dfec7 100644 --- a/src/molgenis/capice/validators/post_vep_processing_validator.py +++ b/src/molgenis/capice/validators/post_vep_processing_validator.py @@ -1,20 +1,29 @@ +import pandas as pd + from molgenis.capice.core.logger import Logger -from molgenis.capice.utilities.column_utils import ColumnUtils +from molgenis.capice.utilities import check_if_in_list class PostVEPProcessingValidator: - def __init__(self, model): - self.model = model + def __init__(self): self.log = Logger().logger - def validate_features_present(self, datafile): + def validate_features_present(self, datafile: pd.DataFrame, vep_features: list[list[str]]) -> \ + None: """ - Validator to see if all features within the model impute values are - presently processed. + Validator to see if all features that should be present after the + ManualVEPProcessor are present. + Args: + datafile: + Pandas Dataframe over which the feature presence validation should happen. + vep_features: + List of lists of expected output ManualVEPProcesing features as saved in the + model.vep_features.values() + Raises: + KeyError: + Raises KeyError when output VEP feature is not present within datafile. """ - column_utils = ColumnUtils() - column_utils.set_specified_columns(self.model.model_features) - features_not_present = column_utils.get_missing_diff_with(datafile.columns) + features_not_present = check_if_in_list(vep_features, datafile.columns) if len(features_not_present) > 0: error_message = 'Detected required feature(s) %s not ' \ 'present within VEP processed input file!' 
diff --git a/src/molgenis/capice/validators/predict_validator.py b/src/molgenis/capice/validators/predict_validator.py
new file mode 100644
index 00000000..18c8a74e
--- /dev/null
+++ b/src/molgenis/capice/validators/predict_validator.py
@@ -0,0 +1,32 @@
+import pandas as pd
+import xgboost as xgb
+
+from molgenis.capice.core.logger import Logger
+
+
+class PredictValidator:
+    def __init__(self):
+        self.log = Logger().logger
+
+    def validate_data_predict_ready(self, dataset: pd.DataFrame,
+                                    model: xgb.XGBClassifier) -> None:
+        """
+        Validates whether dataset is predict-ready according to the feature names in the
+        model.
+
+        Args:
+            dataset:
+                The dataset that is supposed to be predict-ready.
+            model:
+                The custom CAPICE xgboost.XGBClassifier.
+        Raises:
+            KeyError:
+                Raised when a required predict feature is missing from dataset.
+        """
+        missing = []
+        for feature in model.get_booster().feature_names:  # type: ignore
+            if feature not in dataset.columns:
+                missing.append(feature)
+        if len(missing) > 0:
+            error_message = 'Missing required predict column(s): %s'
+            self.log.critical(error_message, ', '.join(missing))
+            raise KeyError(error_message, ', '.join(missing))
diff --git a/src/molgenis/capice/validators/property_type_validator.py b/src/molgenis/capice/validators/property_type_validator.py
index 1a1271e8..615682c5 100644
--- a/src/molgenis/capice/validators/property_type_validator.py
+++ b/src/molgenis/capice/validators/property_type_validator.py
@@ -1,5 +1,5 @@
 class PropertyTypeValidator:
-    def validate_property(self, value: any, expected_type: any, include_none: bool = False):
+    def validate_property(self, value: object, expected_type: type, include_none: bool = False):
         """
         Logger method to raise a TypeError when a Property is not set correctly.

diff --git a/src/molgenis/capice/validators/version_validator.py b/src/molgenis/capice/validators/version_validator.py
index 958f2053..55e0b1d0 100644
--- a/src/molgenis/capice/validators/version_validator.py
+++ b/src/molgenis/capice/validators/version_validator.py
@@ -79,16 +79,18 @@ def validate_versions_compatible(self, capice_version: str, model_version: str):
             ValueError
                 Raised when the model and framework versions are not compatible.
         """
+        # All mypy ignores here are because mypy cannot tell that match() does not
+        # return None at this point.
         capice = match(self.regex, capice_version)
         model = match(self.regex, model_version)
-        if capice.group('major') != model.group('major'):
+        if capice.group('major') != model.group('major'):  # type: ignore
             raise ValueError(
-                f'CAPICE major version {capice.string} does not match with the model '
-                f'{model.string}!'
+                f'CAPICE major version {capice.string} '  # type: ignore
+                f'does not match with the model '
+                f'{model.string}!'  # type: ignore
             )
-        if capice.group('prerelease') or model.group('prerelease'):
-            self._validate_prerelease(capice, model)
+        if capice.group('prerelease') or model.group('prerelease'):  # type: ignore
+            self._validate_prerelease(capice, model)  # type: ignore

     @staticmethod
     def _validate_prerelease(capice_version: re.Match,
diff --git a/src/molgenis/capice/vep/amino_acids.py b/src/molgenis/capice/vep/amino_acids.py
index a1a3289f..c2c987f6 100644
--- a/src/molgenis/capice/vep/amino_acids.py
+++ b/src/molgenis/capice/vep/amino_acids.py
@@ -23,6 +23,10 @@ def naa(self):
         return self.columns[1]

     def _process(self, dataframe: pd.DataFrame):
-        dataframe[self.columns] = dataframe[self.name].str.split('/', expand=True)
-        dataframe[self.naa].fillna(dataframe[self.oaa], inplace=True)
+        if dataframe[self.name].str.contains('/', regex=False).any():
+            dataframe[self.columns] = dataframe[self.name].str.split('/', expand=True)
+            dataframe[self.naa].fillna(dataframe[self.oaa], inplace=True)
+        else:
+            dataframe[self.oaa] = dataframe[self.name]
+            dataframe[self.naa] = dataframe[self.oaa]
         return dataframe
diff --git a/src/molgenis/capice/vep/consequence.py b/src/molgenis/capice/vep/consequence.py
index c3a780fc..4a0d530e 100644
--- a/src/molgenis/capice/vep/consequence.py
+++ b/src/molgenis/capice/vep/consequence.py
@@ -60,6 +60,10 @@ def columns(self):
             'is_splice_polypyrimidine_tract_variant'
         ]

+    @staticmethod
+    def _fillna():
+        return 0
+
     def _process(self, dataframe: pd.DataFrame):
         splitted_consequence = dataframe[self.name].str.split('&', expand=True)
         raw_consequences = []
@@ -69,6 +73,7 @@ def _process(self, dataframe: pd.DataFrame):
                 np.isin(splitted_consequence, current_consequence).any(axis=1), 1, 0
             )
             raw_consequences.append(current_consequence)
+        self._validate_consequences(splitted_consequence, raw_consequences)
         return dataframe
diff --git a/src/molgenis/capice/vep/template.py b/src/molgenis/capice/vep/template.py
index ca08b0ae..1f3834a9 100644
--- a/src/molgenis/capice/vep/template.py
+++ b/src/molgenis/capice/vep/template.py
@@ -39,9 +39,13 @@ def usable(self, value=False):
     def drop(self):
         return True

+    @staticmethod
+    def _fillna():
+        return np.nan
+
     def process(self, dataframe: pd.DataFrame):
         if dataframe[self.name].isnull().all():
-            dataframe[self.columns] = np.nan
+            dataframe[self.columns] = self._fillna()
             return dataframe
         else:
             return self._process(dataframe)
diff --git a/tests/capice/test_main_train.py b/tests/capice/test_main_train.py
index c63228f0..73ad8284 100644
--- a/tests/capice/test_main_train.py
+++ b/tests/capice/test_main_train.py
@@ -42,6 +42,67 @@ def setUp(self):
         self.main.cross_validate = 2
         self.main.n_iterations = 2

+    def test_validate_train_features_duplicates_fail(self):
+        test_features = ['foo', 'bar', 'baz', 'foo']
+        with self.assertRaises(KeyError) as e:
+            self.main._validate_train_features_duplicates(test_features)
+        # Double quotes, since KeyError adds single quotes around str(e.exception).
+        self.assertEqual(
+            "'Detected duplicate features in user supplied train features: foo'",
+            str(e.exception)
+        )
+
+    def test_validate_train_features_duplicates_pass(self):
+        test_features = ['foo', 'bar', 'baz']
+        self.main._validate_train_features_duplicates(test_features)
+
+    def test_component_reset_train_features(self):
+        user_input = ['REF', 'Amino_acids', 'foo']
+        vep_processed = {
+            'REF': ['Type', 'Length'],
+            'Amino_acids': ['oAA', 'nAA']
+        }
+        dataset = pd.DataFrame(
+            columns=['REF', 'oAA', 'nAA', 'foo']
+        )
+        observed = self.main._reset_processing_features(
user_input, vep_processed, dataset.columns) + # Set because order is not important + self.assertSetEqual(set(observed), {'REF', 'oAA', 'nAA', 'foo', 'Type', 'Length'}) + + def test_integration_reset_train_features(self): + with open(self.main.json_path, 'rt') as fh: + user_input = list(json.load(fh).keys()) + self.main._validate_train_features_duplicates(user_input) + data = self.main._load_file(additional_required_features=self.main.additional_required) + self.main._validate_features_present(data, user_input) + data_processed, vep_processed = self.main.process(data, user_input) + observed = self.main._reset_processing_features( + user_input, vep_processed, data_processed.columns + ) + expected = [ + 'PolyPhenCat', 'PolyPhenVal', 'cDNApos', 'relcDNApos', 'SIFTcat', 'SIFTval', + 'protPos', 'relProtPos', 'oAA', 'nAA', 'CDSpos', 'relCDSpos', 'REF', 'ALT', + 'is_regulatory_region_variant', 'is_regulatory_region_ablation', + 'is_regulatory_region_amplification', 'is_missense_variant', 'is_intron_variant', + 'is_upstream_gene_variant', 'is_downstream_gene_variant', 'is_synonymous_variant', + 'is_TF_binding_site_variant', 'is_splice_donor_variant', 'is_coding_sequence_variant', + 'is_splice_region_variant', 'is_stop_gained', 'is_splice_acceptor_variant', + 'is_splice_donor_5th_base_variant', 'is_splice_donor_region_variant', + 'is_splice_polypyrimidine_tract_variant', 'is_frameshift_variant', + 'is_3_prime_UTR_variant', 'is_inframe_insertion', + 'is_inframe_deletion', 'is_5_prime_UTR_variant', 'is_start_lost', + 'is_non_coding_transcript_exon_variant', 'is_non_coding_transcript_variant', + 'is_TFBS_ablation', 'is_TFBS_amplification', 'is_protein_altering_variant', + 'is_stop_lost', 'is_stop_retained_variant', 'is_transcript_ablation', + 'is_intergenic_variant', 'is_start_retained_variant', 'is_transcript_amplification', + 'is_incomplete_terminal_codon_variant', 'is_mature_miRNA_variant', + 'is_NMD_transcript_variant', 'is_feature_elongation', 'is_feature_truncation', + 'SpliceAI_pred_DP_AG', 'SpliceAI_pred_DP_AL', 'SpliceAI_pred_DP_DG', + 'SpliceAI_pred_DP_DL', 'SpliceAI_pred_DS_AG', 'SpliceAI_pred_DS_AL', + 'SpliceAI_pred_DS_DG', 'SpliceAI_pred_DS_DL', 'Type', 'Length', 'Grantham', 'phyloP'] + self.assertSetEqual(set(observed), set(expected)) + def test_integration_training(self): """ Integration test for the full training part of CAPICE. 
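
For reference, the duplicate detection exercised by the tests above boils down to numpy's np.unique with return_counts=True; a minimal standalone sketch with illustrative feature names:

import numpy as np

features = ['foo', 'bar', 'baz', 'foo']
# np.unique returns the sorted unique values plus how often each occurs.
values, counts = np.unique(features, return_counts=True)
duplicates = values[counts > 1]  # array(['foo'], dtype='<U3')
if duplicates.size > 0:
    raise KeyError('Detected duplicate features in user supplied train features: %s'
                   % ', '.join(duplicates))
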
@@ -114,7 +175,7 @@ def test__set_eval_set_test(self): "test" """ processed_features = ['feat1', 'feat2'] - self.main.processed_features = processed_features + self.main.train_features = processed_features test_set = pd.DataFrame(data={ 'binarized_label': [0, 1, 0], 'feat1': [1, 0, 0], @@ -136,7 +197,7 @@ def test__set_eval_set(self): (length should be 2, as "test" shouldn't be included) """ processed_features = ['feat1', 'feat2'] - self.main.processed_features = processed_features + self.main.train_features = processed_features test_set = pd.DataFrame(data={ 'binarized_label': [0, 1, 0], 'feat1': [1, 0, 0], @@ -149,51 +210,107 @@ def test__set_eval_set(self): pd.testing.assert_series_equal(test_set['binarized_label'], eval_set[0][1]) self.assertEqual(2, len(eval_set[0])) - def test_processed_features(self): - with open( - os.path.join( - _project_root_directory, 'tests', 'resources', 'features_test.json' - ), 'rt' - ) as fh: - features = json.load(fh) - dataset = pd.DataFrame( + def test_full_processed_features(self): + loaded_dataset = pd.DataFrame( { - 'unused_feature_1': [1, 2, 3], - 'feature_1': ['foo', 'bar', 'baz'], - 'unused_feature_2': [3, 4, 5], - 'feature_foobarbaz': ['bar', 'baz', 'foo'], - 'feature_3_cat1': [10, 20, 30], - 'feature_3_cat2': [10, 20, 30], - 'feature_3_cat3': [10, 20, 30] + 'REF': ['C', 'GC'], + 'ALT': ['A', 'G'], + 'PolyPhen': [0.1, 0.01], + 'SIFT': [0.1, 0.01], + 'Other_feature': ['foo', 'bar'] } ) - self.main._get_processed_features(dataset, features.keys()) + features = ['REF', 'ALT', 'PolyPhen', 'SIFT'] + processed_data, vep_processed = self.main.process(loaded_dataset, features) + resetted_features = self.main._reset_processing_features( + features, vep_processed, processed_data.columns) self.assertSetEqual( - {'feature_1', - 'feature_foobarbaz', - 'feature_3_cat1', - 'feature_3_cat2', - 'feature_3_cat3'}, - set(self.main.processed_features) + {'REF', 'ALT', 'Length', 'Type', 'PolyPhenVal', 'PolyPhenCat', 'SIFTval', 'SIFTcat'}, + set(resetted_features) ) - def test_full_processed_features(self): - loaded_dataset = pd.DataFrame( + def test_component_feature_selection(self): + test_case = pd.DataFrame( { - 'ref': ['C', 'GC'], - 'alt': ['A', 'G'], - 'PolyPhen': [0.1, 0.01], - 'Sift': [0.1, 0.01], - 'Other_feature': ['foo', 'bar'] + 'chr': [1, 2, 3], + 'pos': [1, 2, 3], + 'REF': ['A', 'AT', 'ATCG'], + 'ALT': ['C', 'G', 'ATGCAB'], + 'REFSEQ_MATCH': ['foo', 'bar', 'baz'], # Included because of REF, can cause issues + 'ALTERNATIVE_FEATURE': ['foo', 'bar', 'baz'], + 'feature_1': ['foo_bar', 'bar', 'baz'] } ) - processed_data = self.main.process(loaded_dataset) - with open(self.main.json_path, 'rt') as fh: - features = json.load(fh).keys() - self.main._get_processed_features(processed_data, features) + user_input = ['REF', 'ALT', 'Type', 'Length', 'feature_1'] + processed_data, vep_processed = self.main.process(test_case, user_input) + self.assertIn( + 'REF', + vep_processed + ) self.assertSetEqual( - {'ref', 'alt', 'Length', 'Type', 'PolyPhenVal', 'PolyPhenCat'}, - set(self.main.processed_features) + set(vep_processed['REF']), + {'Type', 'Length'} + ) + processable_features = self.main._reset_processing_features( + user_input, vep_processed, processed_data.columns + ) + self.assertSetEqual( + set(processable_features), + {'REF', 'ALT', 'Type', 'Length', 'feature_1'} + ) + fully_processed_data, processed_features = self.main.categorical_process( + processed_data, train_features=processable_features, processing_features=None + ) + # Test to see if REF is 
successfully processed
+        self.assertIn(
+            'REF_A',
+            fully_processed_data.columns
+        )
+        # Test to see if REFSEQ is successfully skipped
+        self.assertNotIn(
+            'REFSEQ_MATCH_foo',
+            fully_processed_data.columns
+        )
+        # Another test to see if feature_1 is successfully processed
+        self.assertIn(
+            'feature_1_foo_bar',
+            fully_processed_data.columns
+        )
+        # Test to see if category A is successfully saved for REF
+        self.assertIn(
+            'A',
+            processed_features['REF']
+        )
+
+        self.main._set_train_features(
+            processable_features, processed_features
+        )
+
+        # Test to see if REF_A is successfully inserted into the final training features
+        self.assertIn(
+            'REF_A',
+            self.main.train_features
+        )
+
+        # Test to see if REFSEQ_MATCH_foo is successfully skipped
+        self.assertNotIn(
+            'REFSEQ_MATCH_foo',
+            self.main.train_features
+        )
+        # Test to see if multiple underscores also get successfully inserted
+        self.assertIn(
+            'feature_1_foo_bar',
+            self.main.train_features
+        )
+        self.assertSetEqual(
+            set(self.main.train_features),
+            {
+                'REF_A', 'REF_AT', 'REF_ATCG',
+                'ALT_C', 'ALT_G', 'ALT_ATGCAB',
+                'Type_DELINS', 'Type_SNV',
+                'Length',
+                'feature_1_foo_bar', 'feature_1_bar', 'feature_1_baz'
+            }
+        )
diff --git a/tests/capice/utilities/test_categorical_processor.py b/tests/capice/utilities/test_categorical_processor.py
new file mode 100644
index 00000000..baa89410
--- /dev/null
+++ b/tests/capice/utilities/test_categorical_processor.py
@@ -0,0 +1,231 @@
+import unittest
+
+import numpy as np
+import pandas as pd
+
+from molgenis.capice.utilities.enums import Column
+from tests.capice.test_templates import set_up_impute_preprocess, teardown
+from molgenis.capice.utilities.categorical_processor import CategoricalProcessor
+
+
+class TestCategoricalProcessor(unittest.TestCase):
+    @classmethod
+    def setUp(cls):
+        cls.preprocessor = CategoricalProcessor()
+        cls.chr_pos_ref_alt_testcase = pd.DataFrame(
+            {
+                'chr': [1, 2, 3, 4, 5],
+                'pos': [1, 2, 3, 4, 5],
+                'REF': [1, 2, 3, 4, 5],
+                'ALT': [1, 2, 3, 4, 5]
+            }
+        )
+        cls.main, cls.model = set_up_impute_preprocess()
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        teardown()
+
+    @staticmethod
+    def creat_other_column(value: str) -> str:
+        return '_'.join([value, Column.other.value])
+
+    def test_unit_preprocessing_file(self):
+        """
+        Unit test for the CategoricalProcessor to check if it works with just the file
+        header information.
+        """
+        print('Preprocessing (unit) (file)')
+        self.main.categorical_process(
+            loaded_data=self.main.process(
+                self.main._load_file(), process_features=self.model.vep_features.keys()
+            )[0], processing_features=self.model.processable_features
+        )
+
+    def test_component_preprocessing(self):
+        """
+        Component test for preprocessing. All columns within the model features should be
+        processed. Furthermore, within all processed columns, there should not be one or
+        more columns that are still considered categorical.
+ """ + print('Preprocessing (component)') + processed_file = self.main.categorical_process( + loaded_data=self.main.process( + self.main._load_file(), process_features=self.model.vep_features.keys() + )[0], processing_features=self.model.processable_features + )[0] + model_features = self.model.get_booster().feature_names + processed_columns = processed_file.columns + for feature in model_features: + # Check if all model features are present before predicting + self.assertIn(feature, processed_columns) + # Check if none of the processed columns can be marked as categorical + self.assertEqual( + len(processed_file[model_features].select_dtypes(include=["O"]).columns), + 0 + ) + + def test_preprocessing_train(self): + """ + Component test for the preprocessing part with train=True. + """ + data = pd.DataFrame( + { + 'foo': ['a', 'b', 'c', np.nan, np.nan, np.nan], + 'bar': ['a', np.nan, np.nan, np.nan, np.nan, np.nan], + 'baz': ['a', 'b', 'c', 'd', 'e', 'f'], + 'feature_1': [1, 2, 3, 4, np.nan, np.nan], + 'feature_excluded': [1, 2, 3, 4, np.nan, np.nan], + 'chr': [1, 2, 3, 4, 5, 6], + 'pos': [100, 200, 300, 400, 500, 600], + 'REF': ['A', 'T', 'A', 'T', 'A', 'T'], + 'ALT': ['G', 'C', 'G', 'C', 'G', 'C'] + } + ) + user_input_features = ['foo', 'bar', 'baz', 'feature_1'] + processor = CategoricalProcessor() + observed = processor.process(data, processable_features=user_input_features)[0] + expected = pd.DataFrame( + { + 'foo_a': [1, 0, 0, 0, 0, 0], + 'foo_b': [0, 1, 0, 0, 0, 0], + 'foo_c': [0, 0, 1, 0, 0, 0], + self.creat_other_column('foo'): [0, 0, 0, 1, 1, 1], + 'bar_a': [1, 0, 0, 0, 0, 0], + self.creat_other_column('bar'): [0, 1, 1, 1, 1, 1], + 'baz_a': [1, 0, 0, 0, 0, 0], + 'baz_b': [0, 1, 0, 0, 0, 0], + 'baz_c': [0, 0, 1, 0, 0, 0], + 'baz_d': [0, 0, 0, 1, 0, 0], + 'baz_e': [0, 0, 0, 0, 1, 0], + self.creat_other_column('baz'): [0, 0, 0, 0, 0, 1], + 'REF': ['A', 'T', 'A', 'T', 'A', 'T'], + 'ALT': ['G', 'C', 'G', 'C', 'G', 'C'], + 'feature_1': [1, 2, 3, 4, np.nan, np.nan], + 'feature_excluded': [1, 2, 3, 4, np.nan, np.nan], + 'chr': [1, 2, 3, 4, 5, 6], + 'pos': [100, 200, 300, 400, 500, 600], + 'chr_pos_ref_alt': [ + '1_VeryUniqueCAPICESeparator_100_VeryUniqueCAPICESeparator_' + 'A_VeryUniqueCAPICESeparator_G', + '2_VeryUniqueCAPICESeparator_200_VeryUniqueCAPICESeparator_' + 'T_VeryUniqueCAPICESeparator_C', + '3_VeryUniqueCAPICESeparator_300_VeryUniqueCAPICESeparator_' + 'A_VeryUniqueCAPICESeparator_G', + '4_VeryUniqueCAPICESeparator_400_VeryUniqueCAPICESeparator_' + 'T_VeryUniqueCAPICESeparator_C', + '5_VeryUniqueCAPICESeparator_500_VeryUniqueCAPICESeparator_' + 'A_VeryUniqueCAPICESeparator_G', + '6_VeryUniqueCAPICESeparator_600_VeryUniqueCAPICESeparator_' + 'T_VeryUniqueCAPICESeparator_C', + ] + } + ) + pd.testing.assert_frame_equal( + observed.sort_index(axis=1), expected.sort_index(axis=1), check_dtype=False + ) + + def test_creation_other(self): + test_case = pd.concat( + [ + self.chr_pos_ref_alt_testcase, + pd.DataFrame( + { + 'foo': ['bar', 'baz', 'barz', 'foobar', 'foobaz', 'last'] + } + ) + ], axis=1 + ) + observed_df, observed_dict = self.preprocessor.process(test_case, processable_features=[ + 'foo']) + self.assertIn( + 'foo', + observed_dict.keys() + ) + self.assertIn( + Column.other.value, + observed_dict['foo'] + ) + self.assertIn(self.creat_other_column('foo'), observed_df.columns) + + def test_creation_other_notin(self): + test_case = pd.concat( + [ + self.chr_pos_ref_alt_testcase, + pd.DataFrame( + { + 'foo': ['bar', 'baz', 'barz', 'foobar', 'foobaz'] + } + ) + ], axis=1 + ) 
+        observed_df, observed_dict = self.preprocessor.process(
+            test_case, processable_features=['foo']
+        )
+        self.assertIn('foo', observed_dict.keys())
+        self.assertNotIn(Column.other.value, observed_dict['foo'])
+        self.assertNotIn(self.creat_other_column('foo'), observed_df.columns)
+
+    def test_other_in_top_5(self):
+        # Tests that, if "other" occurs in the top 5 categories, only this "other" feature
+        # gets sampled and no other samples get sampled into the "other" category.
+        test_case = pd.concat(
+            [
+                self.chr_pos_ref_alt_testcase,
+                pd.DataFrame(
+                    {
+                        'foo': ['other', 'other', 'foo', 'bar', 'baz', 'foobar', 'foobaz']
+                    }
+                )
+            ], axis=1
+        )
+        observed_df, observed_dict = self.preprocessor.process(
+            test_case, processable_features=['foo']
+        )
+        test_series = observed_df['foo_other']
+        self.assertFalse(test_series[test_series > 0].size > 2,
+                         msg=f'Actual size: {test_series[test_series > 0].size}')
+        self.assertIn(self.creat_other_column('foo'), observed_df.columns)
+
+    def test__create_preservation_col(self):
+        input_data_frame = pd.DataFrame(
+            {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'REF': ['A', 'T', 'C'],
+             'ALT': ['G', 'A', 'T']})
+        expected_output = pd.DataFrame(
+            {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'REF': ['A', 'T', 'C'],
+             'ALT': ['G', 'A', 'T'],
+             'chr_pos_ref_alt':
+                 ['1_VeryUniqueCAPICESeparator_123_VeryUniqueCAPICESeparator_'
+                  'A_VeryUniqueCAPICESeparator_G',
+                  '2_VeryUniqueCAPICESeparator_456_VeryUniqueCAPICESeparator_'
+                  'T_VeryUniqueCAPICESeparator_A',
+                  '4_VeryUniqueCAPICESeparator_789_VeryUniqueCAPICESeparator_'
+                  'C_VeryUniqueCAPICESeparator_T']
+             }
+        )
+        # _create_preservation_col alters the input dataframe in place.
+        self.preprocessor._create_preservation_col(input_data_frame)
+
+        pd.testing.assert_frame_equal(expected_output, input_data_frame)
+
+    def test__get_categorical_columns(self):
+        preprocessor = CategoricalProcessor()
+        input_data_frame = pd.DataFrame(
+            {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'REF': ['A', 'T', 'C'],
+             'ALT': ['G', 'A', 'T']})
+        features = preprocessor._get_categorical_columns(
+            input_data_frame, processable_features=['REF', 'ALT']
+        )
+        self.assertIn('REF', features.keys())
+        self.assertIn('ALT', features.keys())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/capice/utilities/test_file_postprocessor.py b/tests/capice/utilities/test_file_postprocessor.py
index 257f9d73..0c8698ed 100644
--- a/tests/capice/utilities/test_file_postprocessor.py
+++ b/tests/capice/utilities/test_file_postprocessor.py
@@ -45,8 +45,8 @@ def test_load_file_pre_processor(self):
             {
                 "chr": [1, 2, 3],
                 "pos": [100, 200, 300],
-                "ref": ['A', 'T', 'G'],
-                "alt": ['T', 'G', 'A'],
+                "REF": ['A', 'T', 'G'],
+                "ALT": ['T', 'G', 'A'],
                 "id_source": ['foo', 'foo', 'bar'],
                 "feature": ['bar', 'bar', 'buz'],
                 "gene_name": ['g1', 'g2', 'g3'],
diff --git a/tests/capice/utilities/test_load_file_postprocessor.py b/tests/capice/utilities/test_load_file_postprocessor.py
index c0af6215..76558199 100644
--- a/tests/capice/utilities/test_load_file_postprocessor.py
+++ b/tests/capice/utilities/test_load_file_postprocessor.py
@@ -32,8 +32,8 @@ def test_process(self):
             {
                 'chr': [1],
                 'pos': [123],
-                'ref': ['A'],
-                'alt': ['G'],
+                'REF': ['A'],
+                'ALT': ['G'],
                 'gene_id': [123],
                 'id_source': ['hgnc'],
                 'feature': ['NM1.123'],
diff --git a/tests/capice/utilities/test_manual_vep_processor.py b/tests/capice/utilities/test_manual_vep_processor.py
index 31756a49..461bc421 100644
--- a/tests/capice/utilities/test_manual_vep_processor.py
+++ b/tests/capice/utilities/test_manual_vep_processor.py
@@ -17,8 +17,8 @@ def 
setUpClass(cls) -> None: { 'chr': {0: '1', 1: '1'}, 'pos': {0: 1, 1: 10042538}, - 'ref': {0: 'C', 1: 'C'}, - 'alt': {0: 'T', 1: 'T'}, + 'REF': {0: 'C', 1: 'C'}, + 'ALT': {0: 'T', 1: 'T'}, 'Consequence': {0: 'missense_variant', 1: 'downstream_gene_variant'}, 'gene_name': {0: 'NMNAT1', 1: 'NMNAT1'}, 'SourceID': {0: 'HGNC', 1: 'HGNC'}, @@ -36,6 +36,8 @@ def setUpClass(cls) -> None: } ) cls.annotator = ManualVEPProcessor() + cls.user_input_features = ['REF', 'PolyPhen', 'SIFT', 'Consequence', 'cDNA_position', + 'CDS_position', 'Protein_position', 'Amino_acids'] def setUp(self) -> None: print('Testing case:') @@ -99,8 +101,8 @@ def test_component_annotator(self): [ 'chr', 'pos', - 'ref', - 'alt', + 'REF', + 'ALT', 'gene_name', 'SourceID', 'HGNC_ID', @@ -113,7 +115,7 @@ def test_component_annotator(self): expected_processed_columns ], axis=1 ) - outcome = self.annotator.process(self.dataset) + outcome = self.annotator.process(self.dataset, self.user_input_features) # if numpy.array dtype not given, # then the type will be determined as the minimum type required to hold the # objects in the sequence. this minimal type is system dependent. @@ -128,8 +130,8 @@ def test_bug_attributeerror_template_sift_polyphen(self): { 'chr': ['1', '2'], 'pos': [100, 200], - 'ref': ['A', 'GCC'], - 'alt': ['C', 'C'], + 'REF': ['A', 'GCC'], + 'ALT': ['C', 'C'], 'SIFT': [np.nan, np.nan], 'PolyPhen': [np.nan, np.nan] } @@ -139,8 +141,8 @@ def test_bug_attributeerror_template_sift_polyphen(self): { 'chr': ['1', '2'], 'pos': [100, 200], - 'ref': ['A', 'GCC'], - 'alt': ['C', 'C'], + 'REF': ['A', 'GCC'], + 'ALT': ['C', 'C'], 'SIFTcat': [np.nan, np.nan], 'SIFTval': [np.nan, np.nan], 'PolyPhenCat': [np.nan, np.nan], @@ -148,10 +150,44 @@ def test_bug_attributeerror_template_sift_polyphen(self): } ) annotator = ManualVEPProcessor() - out_dataframe = annotator.process(bugged_dataframe) + out_dataframe = annotator.process(bugged_dataframe, self.user_input_features) # Testing for expected dataframe columns, since it processes more. 
        pd.testing.assert_frame_equal(expected_dataframe, out_dataframe[expected_dataframe.columns])
 
+    @staticmethod
+    def prepare_getter_tests():
+        data = pd.DataFrame(
+            {
+                'REF': ['A', 'C'],
+                'ALT': ['T', 'G'],
+                'PolyPhen': [0.08, 0.98]
+            }
+        )
+        user_input = ['REF', 'PolyPhen']
+        annotator = ManualVEPProcessor()
+        annotator.process(data, user_input)
+        return annotator
+
+    def test_getter_vep_input(self):
+        # Reuse prepare_getter_tests() so the annotator set-up is not duplicated here.
+        annotator = self.prepare_getter_tests()
+        observed = annotator.get_feature_processes()
+        expected_keys = ['REF', 'PolyPhen']
+        expected_values = ['Type', 'Length', 'PolyPhenCat', 'PolyPhenVal']
+        for input_feature in observed.keys():
+            self.assertIn(input_feature, expected_keys)
+        for output_features in observed.values():
+            for feature in output_features:
+                self.assertIn(feature, expected_values)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/capice/utilities/test_predict.py b/tests/capice/utilities/test_predict.py
index a9262dce..304b9efb 100644
--- a/tests/capice/utilities/test_predict.py
+++ b/tests/capice/utilities/test_predict.py
@@ -24,11 +24,11 @@ def test_unit_prediction(self):
         """
         print('Prediction (unit)')
         self.main.predict(
-            self.main.preprocess(
+            self.main.categorical_process(
                 self.main.process(
-                    self.main._load_file()
-                ), model_features=self.model.get_booster().feature_names
-            )
+                    self.main._load_file(), process_features=self.model.vep_features.keys()
+                )[0], processing_features=self.model.processable_features
+            )[0]
         )
 
     def test_component_prediction(self):
@@ -38,11 +38,11 @@ def test_component_prediction(self):
         """
         print('Prediction (component)')
         prediction = self.main.predict(
-            self.main.preprocess(
+            self.main.categorical_process(
                 self.main.process(
-                    self.main._load_file()
-                ), model_features=self.model.get_booster().feature_names
-            )
+                    self.main._load_file(), process_features=self.model.vep_features.keys()
+                )[0], processing_features=self.model.processable_features
+            )[0]
         )
         # Combined sum of the prediction score should be higher than 0
         self.assertGreater(prediction[Column.score.value].sum(), 0)
diff --git a/tests/capice/utilities/test_predictor.py b/tests/capice/utilities/test_predictor.py
index 8b8d77e7..3958c637 100644
--- a/tests/capice/utilities/test_predictor.py
+++ b/tests/capice/utilities/test_predictor.py
@@ -10,11 +10,11 @@ def setUpClass(cls):
         print('Setting up.')
         main, model = set_up_impute_preprocess()
         cls.predictor = Predictor(model)
-        cls.dataset = main.preprocess(
+        cls.dataset = main.categorical_process(
             main.process(
-                main._load_file()
-            ), model_features=model.get_booster().feature_names
-        )
+                main._load_file(), process_features=model.vep_features.keys()
+            )[0], processing_features=model.processable_features
+        )[0]
 
     def test_predict(self):
         observed = self.predictor.predict(self.dataset)
diff --git a/tests/capice/utilities/test_preprocessing.py b/tests/capice/utilities/test_preprocessing.py
deleted file mode 100644
index 2c1f38c6..00000000
--- a/tests/capice/utilities/test_preprocessing.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import unittest
-
-from tests.capice.test_templates import set_up_impute_preprocess, teardown
-
-
-class TestPreprocessing(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        print('Setting up.')
-        cls.main, cls.model = set_up_impute_preprocess()
-
-    @classmethod
-    def tearDownClass(cls):
-        print('Tearing down.')
-        teardown()
-
-    def 
setUp(self): - print('Testing case:') - - def test_unit_preprocessing_file(self): - """ - Unit test for the preprocessor to see if the preprocessor works just - the file header information. - """ - print('Preprocessing (unit) (file)') - self.main.preprocess( - loaded_data=self.main.process( - self.main._load_file() - ), model_features=self.model.get_booster().feature_names - ) - - def test_component_preprocessing(self): - """ - component test for preprocessing. All columns within the CADD - features should be processed. Furthermore, - within all processed columns, - there should not be 1 or more column that is still - considered categorical. - """ - print('Preprocessing (component)') - processed_file = self.main.preprocess( - self.main.process( - self.main._load_file() - ), model_features=self.model.get_booster().feature_names - ) - model_features = self.model.get_booster().feature_names - processed_columns = processed_file.columns - for feature in model_features: - # Check if all model features are present before predicting - self.assertIn(feature, processed_columns) - # Check if none of the processed columns can be marked as categorical - self.assertEqual( - len(processed_file[model_features].select_dtypes(include=["O"]).columns), - 0 - ) - - def test_component_preprocessing_train(self): - """ - Component test for the preprocessing part with train=True. - """ - print('Preprocessing (train) (component)') - preprocessed_file = self.main.preprocess( - self.main.process( - self.main._load_file() - ) - ) - - # Test if all columns matching, - # or starting with features within the imputing - # file are not classified objects. - impute_features = self.model.model_features - processed_columns = preprocessed_file.columns - present_features = 1 - # Should be one, since the for loop quits before - # it can finish the last add_one - test_features = [] - add_one = False - for feature in impute_features: - if add_one: - present_features += 1 - add_one = False - for processed_feature in processed_columns: - if processed_feature.startswith(feature): - add_one = True - test_features.append(processed_feature) - # Test if all impute features are present - self.assertEqual(len(impute_features), present_features) - # Test if no columns are still objects. 
- self.assertEqual( - len(preprocessed_file[test_features].select_dtypes(include=["O"]).columns), - 0 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/capice/utilities/test_preprocessor.py b/tests/capice/utilities/test_preprocessor.py deleted file mode 100644 index bdbfa92d..00000000 --- a/tests/capice/utilities/test_preprocessor.py +++ /dev/null @@ -1,85 +0,0 @@ -import unittest - -import pandas as pd - -from molgenis.capice.utilities.preprocessor import PreProcessor - - -def get_uint8_array(values_list): - return pd.array(values_list, dtype='uint8') - - -class TestPreprocessor(unittest.TestCase): - @classmethod - def setUp(cls): - print('Setting up.') - cls.preprocessor = PreProcessor([]) - - def test__create_preservation_col(self): - input_data_frame = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T']}) - expected_output = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T'], - 'chr_pos_ref_alt': - ['1_VeryUniqueCAPICESeparator_123_VeryUniqueCAPICESeparator_' - 'A_VeryUniqueCAPICESeparator_G', - '2_VeryUniqueCAPICESeparator_456_VeryUniqueCAPICESeparator_' - 'T_VeryUniqueCAPICESeparator_A', - '4_VeryUniqueCAPICESeparator_789_VeryUniqueCAPICESeparator_' - 'C_VeryUniqueCAPICESeparator_T'] - } - ) - actual_output = self.preprocessor._create_preservation_col(input_data_frame) - - pd.testing.assert_frame_equal(expected_output, actual_output) - - def test__is_train(self): - self.assertEqual(False, self.preprocessor.train) - self.preprocessor._is_train() - self.assertEqual(True, self.preprocessor.train) - - def test__get_categorical_columns(self): - input_data_frame = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T']}) - self.preprocessor._get_categorical_columns(input_data_frame) - self.assertEqual(['ref', 'alt'], self.preprocessor.objects) - - def test__process_objects_train_false(self): - self.preprocessor.objects = ['ref', 'alt', 'blaat'] - self.preprocessor.model_features = ['blaat_something'] - input_data_frame = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T'], 'blaat': ['some', 'value', 'something']}) - - expected = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T'], - 'blaat_other': get_uint8_array([1, 1, 0]), - 'blaat_something': get_uint8_array([0, 0, 1]) - }) - observed = self.preprocessor._process_objects(input_data_frame) - pd.testing.assert_frame_equal(expected, observed) - - def test__process_objects_train_true(self): - self.preprocessor.train = True - self.preprocessor.objects = ['ref', 'alt', 'blaat'] - self.preprocessor.model_features = ['blaat_something'] - input_data_frame = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref': ['A', 'T', 'C'], - 'alt': ['G', 'A', 'T'], 'blaat': ['some', 'value', 'here']}) - expected = pd.DataFrame( - {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'ref_A': get_uint8_array([1, 0, 0]), - 'ref_C': get_uint8_array([0, 0, 1]), 'ref_T': get_uint8_array([0, 1, 0]), - 'alt_A': get_uint8_array([0, 1, 0]), 'alt_G': get_uint8_array([1, 0, 0]), - 'alt_T': get_uint8_array([0, 0, 1]), 'blaat_here': get_uint8_array([0, 0, 1]), - 'blaat_some': get_uint8_array([1, 0, 0]), - 'blaat_value': get_uint8_array([0, 1, 0])}) - observed = self.preprocessor._process_objects(input_data_frame) - pd.testing.assert_frame_equal(expected, observed) - - -if 
__name__ == '__main__': - unittest.main() diff --git a/tests/capice/validators/test_post_file_parse_validator.py b/tests/capice/validators/test_post_file_parse_validator.py index adecb669..ae3e57b7 100644 --- a/tests/capice/validators/test_post_file_parse_validator.py +++ b/tests/capice/validators/test_post_file_parse_validator.py @@ -14,8 +14,8 @@ def setUpClass(cls) -> None: { 'chr': [1, 2], 'pos': [100, 200], - 'ref': ['A', 'A'], - 'alt': ['T', 'T'], + 'REF': ['A', 'A'], + 'ALT': ['T', 'T'], 'feat1': ['foo', 'bar'] } ) @@ -62,7 +62,7 @@ def test_validation_incorrect_required_columns_preset_required(self): self.assertRaises( KeyError, self.validator.validate_minimally_required_columns, - self.dataset.drop(columns='ref'), + self.dataset.drop(columns='REF'), additional_required_features='feat1' ) diff --git a/tests/capice/validators/test_post_vep_processing_validator.py b/tests/capice/validators/test_post_vep_processing_validator.py index 06b05aaf..44d8fb8b 100644 --- a/tests/capice/validators/test_post_vep_processing_validator.py +++ b/tests/capice/validators/test_post_vep_processing_validator.py @@ -14,14 +14,13 @@ def setUpClass(cls) -> None: { 'chr': [1, 2], 'pos': [100, 200], - 'ref': ['A', 'A'], - 'alt': ['T', 'T'], + 'REF': ['A', 'A'], + 'ALT': ['T', 'T'], 'feat1': ['foo', 'bar'] } ) - cls.validator = PostVEPProcessingValidator( - load_model(ResourceFile.XGB_BOOSTER_POC_UBJ.value) - ) + cls.validator = PostVEPProcessingValidator() + cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_UBJ.value) @classmethod def tearDownClass(cls) -> None: @@ -33,7 +32,8 @@ def test_validate_features_present_incorrect(self): self.assertRaises( KeyError, self.validator.validate_features_present, - self.dataset + self.dataset, + self.model.vep_features.values() ) diff --git a/tests/capice/vep/test_amino_acids.py b/tests/capice/vep/test_amino_acids.py index b7b3ba53..49f7b989 100644 --- a/tests/capice/vep/test_amino_acids.py +++ b/tests/capice/vep/test_amino_acids.py @@ -19,6 +19,14 @@ def test_process(self): 'nAA': ['G', 'C', 'C']}) pd.testing.assert_frame_equal(expected, observed) + def test_process_no_alt(self): + dataframe = pd.DataFrame({'Amino_acids': ['A', 'R', 'G']}) + observed = self.aa.process(dataframe) + expected = pd.DataFrame({'Amino_acids': ['A', 'R', 'G'], + 'oAA': ['A', 'R', 'G'], + 'nAA': ['A', 'R', 'G']}) + pd.testing.assert_frame_equal(expected, observed) + if __name__ == '__main__': unittest.main() diff --git a/tests/capice/vep/test_consequence.py b/tests/capice/vep/test_consequence.py index 05c72719..ea7a7b8e 100644 --- a/tests/capice/vep/test_consequence.py +++ b/tests/capice/vep/test_consequence.py @@ -73,6 +73,25 @@ def test_consequence(self): pd.testing.assert_frame_equal(observerd.sort_index(axis=1), expected.sort_index( axis=1), check_dtype=False) + def test_non_coding(self): + data = pd.DataFrame({ + 'variants': ['variant_1', 'variant_2', 'variant_3'], + 'Consequence': [np.nan, np.nan, np.nan] + }) + columns = data.columns + expected_altered = self.expected_data.copy(deep=True) + # Easier to locate the ones in self.expected_data than to hardcode a new one + expected_altered.loc[1, 'is_start_lost'] = 0 + expected_altered.loc[0, 'is_stop_lost'] = 0 + expected_altered.loc[0, 'is_transcript_ablation'] = 0 + expected = pd.concat([data, expected_altered], axis=1) + observed = Consequence().process(data) + self.assertFalse(observed[observed.columns.difference(columns)].isnull().values.any()) + pd.testing.assert_frame_equal( + observed.sort_index(axis=1), + 
expected.sort_index(axis=1) + ) + def test_consequence_warning(self): """ Tests that when a consequence is encountered that is not present within the processor diff --git a/tests/capice/vep/test_length.py b/tests/capice/vep/test_length.py index f41fc484..f10c6437 100644 --- a/tests/capice/vep/test_length.py +++ b/tests/capice/vep/test_length.py @@ -13,12 +13,12 @@ def setUpClass(cls): def test_process(self): dataframe = pd.DataFrame({ - 'ref': ['ATAG', 'A', 'C', 'AC'], - 'alt': ['A', 'ATG', 'A', 'GT']}) + 'REF': ['ATAG', 'A', 'C', 'AC'], + 'ALT': ['A', 'ATG', 'A', 'GT']}) observed = self.length.process(dataframe) expected = pd.DataFrame({ - 'ref': ['ATAG', 'A', 'C', 'AC'], - 'alt': ['A', 'ATG', 'A', 'GT'], + 'REF': ['ATAG', 'A', 'C', 'AC'], + 'ALT': ['A', 'ATG', 'A', 'GT'], 'Length': [3, 2, 0, 0]}) pd.testing.assert_frame_equal(expected, observed) diff --git a/tests/capice/vep/test_type.py b/tests/capice/vep/test_type.py index cf8cccc2..e7aaa74c 100644 --- a/tests/capice/vep/test_type.py +++ b/tests/capice/vep/test_type.py @@ -12,12 +12,12 @@ def setUpClass(cls): cls.type = type.Type() def test_process(self): - input_data_frame = pd.DataFrame({'ref': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], - 'alt': ['G', 'GCC', 'GG', 'CG', 'G', 'C']}) + input_data_frame = pd.DataFrame({'REF': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], + 'ALT': ['G', 'GCC', 'GG', 'CG', 'G', 'C']}) actual_output = self.type.process(input_data_frame) expected_output = pd.DataFrame({ - 'ref': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], - 'alt': ['G', 'GCC', 'GG', 'CG', 'G', 'C'], + 'REF': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], + 'ALT': ['G', 'GCC', 'GG', 'CG', 'G', 'C'], 'Type': ['SNV', 'DELINS', 'DELINS', 'INS', 'DELINS', 'DEL']}) pd.testing.assert_frame_equal(actual_output, expected_output) diff --git a/tests/resources/xgb_booster_poc.ubj b/tests/resources/xgb_booster_poc.ubj index d82d8d63..324341a2 100644 Binary files a/tests/resources/xgb_booster_poc.ubj and b/tests/resources/xgb_booster_poc.ubj differ
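Aside: the CategoricalProcessor tests above pin down a "top 5 plus other" one-hot scheme,
where each processable feature keeps its most frequent values and collapses the remainder
into an "other" level before dummy encoding. A minimal pandas-only sketch of that behaviour
(illustrative only; the helper name and the hard-coded n=5 are assumptions here, not the
CAPICE implementation itself):

    import pandas as pd

    def top_n_other_dummies(frame: pd.DataFrame, feature: str, n: int = 5) -> pd.DataFrame:
        # Keep the n most frequent values; map everything else to the literal 'other'.
        top = frame[feature].value_counts().index[:n]
        collapsed = frame[feature].where(frame[feature].isin(top), other='other')
        # One-hot encode and drop the raw column, mirroring what the tests assert.
        dummies = pd.get_dummies(collapsed, prefix=feature)
        return pd.concat([frame.drop(columns=feature), dummies], axis=1)

    frame = pd.DataFrame({'foo': ['bar', 'baz', 'barz', 'foobar', 'foobaz', 'last']})
    # Six distinct values -> five kept plus 'foo_other' (cf. test_creation_other);
    # with five or fewer distinct values no 'foo_other' column would appear.
    print(top_n_other_dummies(frame, 'foo').columns.tolist())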
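Likewise, the chr_pos_ref_alt preservation column asserted in test__create_preservation_col
amounts to a row-wise string join over the four identifying columns. A hedged sketch of the
equivalent pandas expression (the separator literal is taken from the tests above; the rest
is illustrative, not the CAPICE code):

    import pandas as pd

    SEPARATOR = '_VeryUniqueCAPICESeparator_'
    frame = pd.DataFrame(
        {'chr': [1, 2, 4], 'pos': [123, 456, 789], 'REF': ['A', 'T', 'C'],
         'ALT': ['G', 'A', 'T']})
    # Joins e.g. 1, 123, A, G into '1_VeryUniqueCAPICESeparator_123_..._A_..._G'.
    frame['chr_pos_ref_alt'] = (
        frame[['chr', 'pos', 'REF', 'ALT']].astype(str).agg(SEPARATOR.join, axis=1)
    )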