Skip to content

Commit

Permalink
Merge pull request #35 from SietsmaRJ/master
Browse files Browse the repository at this point in the history
Exposed transcript identifier in CAPICE output
  • Loading branch information
marikaris authored May 28, 2021
2 parents b520d41 + 2469fd6 commit 0e3bbac
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 61 deletions.
18 changes: 14 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ The following sections will guide you through the steps needed for the variant a
making predictions using the CAPICE model.

### Download and installation (UNIX like systems)
__Note: this install is for Python 3.7 and Python 3.8.
__Note: this install is for Python 3.7, Python 3.8 and Python 3.9.
Python 3.6 is also supported; installation instructions for it can be found at the bottom of this chapter.
Python 3.5 and lower or Python 3.9 and higher is not supported (yet).__
Python 3.5 and lower is not supported.__

1. Software and libraries
CAPICE scripts can be downloaded from the CAPICE github repository.
Expand Down Expand Up @@ -95,7 +95,18 @@ CAPICE requires the following arguments:
- -i / --input: The path to the input [CADD annotated](https://cadd.gs.washington.edu/) dataset using the tab separator (can be both gzipped or not). An example of an input TSV file can be found in `CAPICE_example/test_cadd14_grch37_annotated.tsv.gz` for CADD 1.4 and genome build 37.

The following flags are optional:
- -o / --output: The path to the directory, output filename or output directory and filename where the output is placed (will be made if it does not exists). If only a filename is supplied, or no output is supplied, the file will be placed within the input directory. __The file will always be gzipped!__
- -o / --output: The path to the directory, output filename or output directory and filename where the output is placed (will be made if it does not exist). If only a filename is supplied, or no output is supplied, the file will be placed within the input directory. __The file will always be gzipped with a .gz extension!__

_For instance:_

`-i input.txt` becomes `input_capice.txt.gz`

`-i input.txt -o output.txt` becomes `output.txt.gz`

`-i input.txt -o path/to/output.txt` becomes `path/to/output.txt.gz`

`-i input.txt -o path/to/output` becomes `path/to/output/input_capice.txt.gz`

- -v / --verbose: Display more in depth messages within the progress of CAPICE.
- -f / --force: Overwrite an output file if already present (does NOT work for logfiles).
- --train: Activates the 'train new CAPICE-like models' within CAPICE.
Expand All @@ -108,7 +119,6 @@ A file will be produced containing the following columns:

- __No__ index
- chr_pos_ref_alt: column containing the chromosome, position, reference and alternative separated by an underscore.
- ID: Column full of `.`.
- GeneName: The ENSEMBL gene name of the variant as supplied by CADD.
- FeatureID: The ENSEMBL feature ID (Transcript ID or regulatory feature ID).
- Consequence: The type of consequence that the variant has as supplied by CADD.
Expand Down
28 changes: 1 addition & 27 deletions src/main/python/core/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import os
import pandas as pd
import pickle
import warnings


class Exporter:
Expand All @@ -31,34 +30,9 @@ def export_capice_prediction(self, datafile: pd.DataFrame):
:param datafile: prediction pandas DataFrame
"""
filename = self._export_filename_ready(file_name=self.capice_filename, check_extension=False)
# datafile[self.export_cols].to_csv(filename, sep='\t', index=False)
datafile = self._export_legacy_prediction(datafile=datafile)
datafile.to_csv(filename, sep='\t', index=False)
datafile[self.export_cols].to_csv(filename, sep='\t', compression='gzip', index=False)
self.log.info('Successfully exported CAPICE datafile to: {}'.format(filename))

def _export_legacy_prediction(self, datafile):
warnings.warn('Using legacy export function, deprecated in 2.1.', DeprecationWarning)
datafile = datafile[self.export_cols]

# Required to prevent the SettingWithCopyWarning, even when using:
# dataframe.loc[row_indexer,col_indexer] = value
pd.options.mode.chained_assignment = None

datafile.loc[:, Column.prediction.value] = 'empty'
datafile.loc[:, Column.combined_prediction.value] = 'empty'
datafile.loc[:, Column.PHRED.value] = 0.0
datafile.drop(columns=Column.FeatureID.value, inplace=True)
datafile = datafile[
[Column.chr_pos_ref_alt.value,
Column.GeneName.value,
Column.Consequence.value,
Column.PHRED.value,
Column.probabilities.value,
Column.prediction.value,
Column.combined_prediction.value]
]
return datafile

def export_capice_training_dataset(self, datafile: pd.DataFrame, name: str, feature: str):
"""
Function specific to export a (split) dataset coming from the training pathway.
Expand Down
2 changes: 1 addition & 1 deletion src/main/python/core/input_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def check_input_output_directories(self, input_path, output_path):
# Then I know it's an output filename
self.output_directory = os.path.dirname(input_path)
self.output_filename = output_path
# self._check_gzip_extension()
self._check_gzip_extension()

def _create_capice_output_filename(self, input_path, output_path=None, append_capice=True, ispath=False):
if output_path is None:
Expand Down
3 changes: 0 additions & 3 deletions src/main/python/resources/enums/sections.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,3 @@ class Column(Enum):
FeatureID = 'FeatureID'
Consequence = 'Consequence'
probabilities = 'probabilities'
prediction = 'prediction'
combined_prediction = 'combined_prediction'
PHRED = 'PHRED'
2 changes: 1 addition & 1 deletion src/main/python/resources/preprocessors/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _raise_no_module_found_error(self):
Specialized function to be used into _load_preprocessors() and _load_correct_preprocessor() to be raised when
no preprocessing files can be found.
"""
error_message = 'No usable python files are found within the imputing directory!'
error_message = 'No usable python files are found within the model directory!'
self.log.critical(error_message)
raise FileNotFoundError(error_message)

Expand Down
26 changes: 6 additions & 20 deletions src/test/python/core/test_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,6 @@ def setUpClass(cls):
Column.Consequence.value: ['Synonymous', 'Frame-shift'],
Column.probabilities.value: [0.01, 0.998]
})
cls.legacy_export_prediction = pd.DataFrame(
{
Column.chr_pos_ref_alt.value: ['1_100_A_C', '2_200_T_G'],
Column.GeneName.value: ['foo', 'bar'],
Column.Consequence.value: ['Synonymous', 'Frame-shift'],
Column.PHRED.value: [0.0, 0.0],
Column.probabilities.value: [0.01, 0.998],
Column.prediction.value: ['empty', 'empty'],
Column.combined_prediction.value: ['empty', 'empty']
}
)
cls.export_dataset = pd.DataFrame(
{
'chr': [1, 2],
Expand All @@ -53,13 +42,8 @@ def test_prediction_output(self):
print('Prediction output')
self.exporter.capice_filename = 'test_output.tsv'
self.exporter.export_capice_prediction(datafile=self.prediction_output_dataframe)
exported_data = pd.read_csv(os.path.join(self.output_loc, 'test_output.tsv'), sep='\t')
pd.testing.assert_frame_equal(exported_data, self.legacy_export_prediction)

def test_legacy_conversion(self):
print('Legacy output conversion')
converted_legacy = self.exporter._export_legacy_prediction(datafile=self.prediction_output_dataframe)
pd.testing.assert_frame_equal(converted_legacy, self.legacy_export_prediction)
exported_data = pd.read_csv(os.path.join(self.output_loc, 'test_output.tsv'), compression='gzip', sep='\t')
pd.testing.assert_frame_equal(exported_data, self.prediction_output_dataframe)

def test_dataset_export(self):
print('Dataset export')
Expand All @@ -85,8 +69,10 @@ def test_exporter_force(self):
self.exporter.force = True
self.exporter.capice_filename = 'already_present_file.tsv'
self.exporter.export_capice_prediction(datafile=self.prediction_output_dataframe)
forced_file = pd.read_csv(os.path.join(self.output_loc, 'already_present_file.tsv'), sep='\t')
pd.testing.assert_frame_equal(forced_file, self.legacy_export_prediction)
forced_file = pd.read_csv(
os.path.join(self.output_loc, 'already_present_file.tsv'), compression='gzip', sep='\t'
)
pd.testing.assert_frame_equal(forced_file, self.prediction_output_dataframe)


if __name__ == '__main__':
Expand Down
6 changes: 3 additions & 3 deletions src/test/python/core/test_input_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,7 @@ def test_input_output_conversion_case3(self):
print('Input output conversion (input + output directory + filename)')
test_input = os.path.join('.', 'CAPICE_example', 'test_cadd14_grch37_annotated.tsv.gz')
test_output = os.path.join('.', 'test_output', 'test.txt')
# expected_output_filename = 'test.txt.gz'
expected_output_filename = 'test.txt' # Legacy support, if legacy is disabled can be removed.
expected_output_filename = 'test.txt.gz'
expected_output_directory = os.path.join('.', 'test_output')
self.input_checker.check_input_output_directories(input_path=test_input, output_path=test_output)
self.assertEqual(self.input_checker.get_output_filename(), expected_output_filename)
Expand All @@ -97,9 +96,10 @@ def test_input_output_conversion_case4(self):
print('Input output conversion (input + filename)')
test_input = os.path.join('.', 'CAPICE_example', 'test_cadd14_grch37_annotated.tsv.gz')
test_output = 'test.txt'
expected_output_filename = 'test.txt.gz'
expected_output_directory = os.path.join('.', 'CAPICE_example')
self.input_checker.check_input_output_directories(input_path=test_input, output_path=test_output)
self.assertEqual(self.input_checker.get_output_filename(), test_output)
self.assertEqual(self.input_checker.get_output_filename(), expected_output_filename)
self.assertEqual(self.input_checker.get_output_directory(), expected_output_directory)

def test_log_checker_both(self):
Expand Down
4 changes: 2 additions & 2 deletions src/test/python/test_main_nontrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ def test_integration_main_nontrain(self):
input_loc=infile,
output_loc=self.output_dir)
main.run()
prediction_output = pd.read_csv(os.path.join(self.output_dir, 'test_output.txt'), sep='\t')
self.assertEqual(prediction_output.shape, (20, 7))
prediction_output = pd.read_csv(os.path.join(self.output_dir, 'test_output.txt'), compression='gzip', sep='\t')
self.assertEqual(prediction_output.shape, (20, 5))


if __name__ == '__main__':
Expand Down

0 comments on commit 0e3bbac

Please sign in to comment.