Skip to content

Commit

Permalink
Merge pull request #35 from SietsmaRJ/master
Browse files Browse the repository at this point in the history
Exposed transcript identifier in CAPICE output
  • Loading branch information
marikaris authored May 28, 2021
2 parents b520d41 + 2469fd6 commit 0e3bbac
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 61 deletions.
18 changes: 14 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ The following sections will guide you through the steps needed for the variant a
making predictions using the CAPICE model.

### Download and installation (UNIX like systems)
__Note: this install is for Python 3.7 and Python 3.8.
__Note: this install is for Python 3.7, Python 3.8 and Python 3.9.
Python 3.6 is also supported; installation instructions for it can be found at the bottom of this chapter.
Python 3.5 and lower or Python 3.9 and higher is not supported (yet).__
Python 3.5 and lower is not supported.__

1. Software and libraries
CAPICE scripts can be downloaded from the CAPICE github repository.
Expand Down Expand Up @@ -95,7 +95,18 @@ CAPICE requires the following arguments:
- -i / --input: The path to the input [CADD annotated](https://cadd.gs.washington.edu/) dataset using the tab separator (can be both gzipped or not). An example of an input TSV file can be found in `CAPICE_example/test_cadd14_grch37_annotated.tsv.gz` for CADD 1.4 and genome build 37.

The following flags are optional:
- -o / --output: The path to the directory, output filename or output directory and filename where the output is placed (will be made if it does not exists). If only a filename is supplied, or no output is supplied, the file will be placed within the input directory. __The file will always be gzipped!__
- -o / --output: The path to the directory, output filename or output directory and filename where the output is placed (will be made if it does not exist). If only a filename is supplied, or no output is supplied, the file will be placed within the input directory. __The file will always be gzipped with a .gz extension!__

_For instance:_

`-i input.txt` becomes `input_capice.txt.gz`

`-i input.txt -o output.txt` becomes `output.txt.gz`

`-i input.txt -o path/to/output.txt` becomes `path/to/output.txt.gz`

`-i input.txt -o path/to/output` becomes `path/to/output/input_capice.txt.gz`

- -v / --verbose: Display more in depth messages within the progress of CAPICE.
- -f / --force: Overwrite an output file if already present (does NOT work for logfiles).
- --train: Activates the 'train new CAPICE-like models' within CAPICE.
Expand All @@ -108,7 +119,6 @@ A file will be produced containing the following columns:

- __No__ index
- chr_pos_ref_alt: column containing the chromosome, position, reference and alternative separated by an underscore.
- ID: Column full of `.`.
- GeneName: The ENSEMBL gene name of the variant as supplied by CADD.
- FeatureID: The ENSEMBL feature ID (Transcript ID or regulatory feature ID).
- Consequence: The type of consequence that the variant has as supplied by CADD.
Expand Down
28 changes: 1 addition & 27 deletions src/main/python/core/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import os
import pandas as pd
import pickle
import warnings


class Exporter:
Expand All @@ -31,34 +30,9 @@ def export_capice_prediction(self, datafile: pd.DataFrame):
:param datafile: prediction pandas DataFrame
"""
filename = self._export_filename_ready(file_name=self.capice_filename, check_extension=False)
# datafile[self.export_cols].to_csv(filename, sep='\t', index=False)
datafile = self._export_legacy_prediction(datafile=datafile)
datafile.to_csv(filename, sep='\t', index=False)
datafile[self.export_cols].to_csv(filename, sep='\t', compression='gzip', index=False)
self.log.info('Successfully exported CAPICE datafile to: {}'.format(filename))

def _export_legacy_prediction(self, datafile):
warnings.warn('Using legacy export function, deprecated in 2.1.', DeprecationWarning)
datafile = datafile[self.export_cols]

# Required to prevent the SettingWithCopyWarning, even when using:
# dataframe.loc[row_indexer,col_indexer] = value
pd.options.mode.chained_assignment = None

datafile.loc[:, Column.prediction.value] = 'empty'
datafile.loc[:, Column.combined_prediction.value] = 'empty'
datafile.loc[:, Column.PHRED.value] = 0.0
datafile.drop(columns=Column.FeatureID.value, inplace=True)
datafile = datafile[
[Column.chr_pos_ref_alt.value,
Column.GeneName.value,
Column.Consequence.value,
Column.PHRED.value,
Column.probabilities.value,
Column.prediction.value,
Column.combined_prediction.value]
]
return datafile

def export_capice_training_dataset(self, datafile: pd.DataFrame, name: str, feature: str):
"""
Function specific to export a (split) dataset coming from the training pathway.
Expand Down
2 changes: 1 addition & 1 deletion src/main/python/core/input_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def check_input_output_directories(self, input_path, output_path):
# Then I know it's an output filename
self.output_directory = os.path.dirname(input_path)
self.output_filename = output_path
# self._check_gzip_extension()
self._check_gzip_extension()

def _create_capice_output_filename(self, input_path, output_path=None, append_capice=True, ispath=False):
if output_path is None:
Expand Down
3 changes: 0 additions & 3 deletions src/main/python/resources/enums/sections.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,3 @@ class Column(Enum):
FeatureID = 'FeatureID'
Consequence = 'Consequence'
probabilities = 'probabilities'
prediction = 'prediction'
combined_prediction = 'combined_prediction'
PHRED = 'PHRED'
2 changes: 1 addition & 1 deletion src/main/python/resources/preprocessors/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _raise_no_module_found_error(self):
Specialized function to be used into _load_preprocessors() and _load_correct_preprocessor() to be raised when
no preprocessing files can be found.
"""
error_message = 'No usable python files are found within the imputing directory!'
error_message = 'No usable python files are found within the model directory!'
self.log.critical(error_message)
raise FileNotFoundError(error_message)

Expand Down
26 changes: 6 additions & 20 deletions src/test/python/core/test_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,6 @@ def setUpClass(cls):
Column.Consequence.value: ['Synonymous', 'Frame-shift'],
Column.probabilities.value: [0.01, 0.998]
})
cls.legacy_export_prediction = pd.DataFrame(
{
Column.chr_pos_ref_alt.value: ['1_100_A_C', '2_200_T_G'],
Column.GeneName.value: ['foo', 'bar'],
Column.Consequence.value: ['Synonymous', 'Frame-shift'],
Column.PHRED.value: [0.0, 0.0],
Column.probabilities.value: [0.01, 0.998],
Column.prediction.value: ['empty', 'empty'],
Column.combined_prediction.value: ['empty', 'empty']
}
)
cls.export_dataset = pd.DataFrame(
{
'chr': [1, 2],
Expand All @@ -53,13 +42,8 @@ def test_prediction_output(self):
print('Prediction output')
self.exporter.capice_filename = 'test_output.tsv'
self.exporter.export_capice_prediction(datafile=self.prediction_output_dataframe)
exported_data = pd.read_csv(os.path.join(self.output_loc, 'test_output.tsv'), sep='\t')
pd.testing.assert_frame_equal(exported_data, self.legacy_export_prediction)

def test_legacy_conversion(self):
print('Legacy output conversion')
converted_legacy = self.exporter._export_legacy_prediction(datafile=self.prediction_output_dataframe)
pd.testing.assert_frame_equal(converted_legacy, self.legacy_export_prediction)
exported_data = pd.read_csv(os.path.join(self.output_loc, 'test_output.tsv'), compression='gzip', sep='\t')
pd.testing.assert_frame_equal(exported_data, self.prediction_output_dataframe)

def test_dataset_export(self):
print('Dataset export')
Expand All @@ -85,8 +69,10 @@ def test_exporter_force(self):
self.exporter.force = True
self.exporter.capice_filename = 'already_present_file.tsv'
self.exporter.export_capice_prediction(datafile=self.prediction_output_dataframe)
forced_file = pd.read_csv(os.path.join(self.output_loc, 'already_present_file.tsv'), sep='\t')
pd.testing.assert_frame_equal(forced_file, self.legacy_export_prediction)
forced_file = pd.read_csv(
os.path.join(self.output_loc, 'already_present_file.tsv'), compression='gzip', sep='\t'
)
pd.testing.assert_frame_equal(forced_file, self.prediction_output_dataframe)


if __name__ == '__main__':
Expand Down
6 changes: 3 additions & 3 deletions src/test/python/core/test_input_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,7 @@ def test_input_output_conversion_case3(self):
print('Input output conversion (input + output directory + filename)')
test_input = os.path.join('.', 'CAPICE_example', 'test_cadd14_grch37_annotated.tsv.gz')
test_output = os.path.join('.', 'test_output', 'test.txt')
# expected_output_filename = 'test.txt.gz'
expected_output_filename = 'test.txt' # Legacy support, if legacy is disabled can be removed.
expected_output_filename = 'test.txt.gz'
expected_output_directory = os.path.join('.', 'test_output')
self.input_checker.check_input_output_directories(input_path=test_input, output_path=test_output)
self.assertEqual(self.input_checker.get_output_filename(), expected_output_filename)
Expand All @@ -97,9 +96,10 @@ def test_input_output_conversion_case4(self):
print('Input output conversion (input + filename)')
test_input = os.path.join('.', 'CAPICE_example', 'test_cadd14_grch37_annotated.tsv.gz')
test_output = 'test.txt'
expected_output_filename = 'test.txt.gz'
expected_output_directory = os.path.join('.', 'CAPICE_example')
self.input_checker.check_input_output_directories(input_path=test_input, output_path=test_output)
self.assertEqual(self.input_checker.get_output_filename(), test_output)
self.assertEqual(self.input_checker.get_output_filename(), expected_output_filename)
self.assertEqual(self.input_checker.get_output_directory(), expected_output_directory)

def test_log_checker_both(self):
Expand Down
4 changes: 2 additions & 2 deletions src/test/python/test_main_nontrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ def test_integration_main_nontrain(self):
input_loc=infile,
output_loc=self.output_dir)
main.run()
prediction_output = pd.read_csv(os.path.join(self.output_dir, 'test_output.txt'), sep='\t')
self.assertEqual(prediction_output.shape, (20, 7))
prediction_output = pd.read_csv(os.path.join(self.output_dir, 'test_output.txt'), compression='gzip', sep='\t')
self.assertEqual(prediction_output.shape, (20, 5))


if __name__ == '__main__':
Expand Down

0 comments on commit 0e3bbac

Please sign in to comment.